Merge remote-tracking branch 'block/for-next'

Conflicts: arch/cris/include/asm/Kbuild arch/um/include/asm/Kbuild drivers/block/nvme-core.c drivers/md/dm.c include/linux/blkdev.h mm/backing-dev.c mm/page-writeback.c
author: Michael Ellerman <mpe@ellerman.id.au> 2015-06-05 16:08:29 +1000
committer: Michael Ellerman <mpe@ellerman.id.au> 2015-06-05 16:08:29 +1000
commit: a71e49de186dc0b08f5218b783cbe64ded9cd983 (patch)
tree: b53e89ab4d39937bd36b022ee8d1b2e37e2179af
parent: 5cd8ae4c0ec4193aece58d0460eab2a86b0e2187 (diff)
parent: 613a2e5fb8db8dae722ee80dabc216bb65cb1a86 (diff)
download: linux-next-a71e49de186dc0b08f5218b783cbe64ded9cd983.tar.gz
181 files changed, 5173 insertions, 3301 deletions
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index f456b4315e86d8..ff71e16cc75248 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -493,6 +493,7 @@ pgpgin		- # of charging events to the memory cgroup. The charging
 pgpgout		- # of uncharging events to the memory cgroup. The uncharging
 		event happens each time a page is unaccounted from the cgroup.
 swap		- # of bytes of swap usage
+dirty		- # of bytes that are waiting to get written back to the disk.
 writeback	- # of bytes of file/anon cache that are queued for syncing to
 		disk.
 inactive_anon	- # of bytes of anonymous and swap cache memory on inactive
diff --git a/arch/alpha/include/asm/Kbuild b/arch/alpha/include/asm/Kbuild
index 76aeb8fa551a83..cde23cd03609af 100644
--- a/arch/alpha/include/asm/Kbuild
+++ b/arch/alpha/include/asm/Kbuild
@@ -6,6 +6,5 @@ generic-y += exec.h
 generic-y += irq_work.h
 generic-y += mcs_spinlock.h
 generic-y += preempt.h
-generic-y += scatterlist.h
 generic-y += sections.h
 generic-y += trace_clock.h
diff --git a/arch/alpha/include/asm/pci.h b/arch/alpha/include/asm/pci.h
index f7f680f7457dec..bf7d97dce9e81e 100644
--- a/arch/alpha/include/asm/pci.h
+++ b/arch/alpha/include/asm/pci.h
@@ -5,7 +5,7 @@
 
 #include <linux/spinlock.h>
 #include <linux/dma-mapping.h>
-#include <asm/scatterlist.h>
+#include <linux/scatterlist.h>
 #include <asm/machvec.h>
 #include <asm-generic/pci-bridge.h>
 
diff --git a/arch/arc/include/asm/Kbuild b/arch/arc/include/asm/Kbuild
index be0c39e76f7cdf..769b312c1abb54 100644
--- a/arch/arc/include/asm/Kbuild
+++ b/arch/arc/include/asm/Kbuild
@@ -33,7 +33,6 @@ generic-y += poll.h
 generic-y += posix_types.h
 generic-y += preempt.h
 generic-y += resource.h
-generic-y += scatterlist.h
 generic-y += sembuf.h
 generic-y += shmbuf.h
 generic-y += siginfo.h
diff --git a/arch/arm/include/asm/Kbuild b/arch/arm/include/asm/Kbuild
index 3c4596d0ce6cbd..83c50193626ce0 100644
--- a/arch/arm/include/asm/Kbuild
+++ b/arch/arm/include/asm/Kbuild
@@ -20,7 +20,6 @@ generic-y += poll.h
 generic-y += preempt.h
 generic-y += resource.h
 generic-y += rwsem.h
-generic-y += scatterlist.h
 generic-y += seccomp.h
 generic-y += sections.h
 generic-y += segment.h
diff --git a/arch/arm/include/asm/dma.h b/arch/arm/include/asm/dma.h
index 99084431d6ae26..bb4fa67da54193 100644
--- a/arch/arm/include/asm/dma.h
+++ b/arch/arm/include/asm/dma.h
@@ -19,7 +19,7 @@
  * It should not be re-used except for that purpose.
  */
 #include <linux/spinlock.h>
-#include <asm/scatterlist.h>
+#include <linux/scatterlist.h>
 
 #include <mach/isa-dma.h>
 
diff --git a/arch/arm/mach-footbridge/dma.c b/arch/arm/mach-footbridge/dma.c
index e2e0df8bcee27d..22536b85a81d53 100644
--- a/arch/arm/mach-footbridge/dma.c
+++ b/arch/arm/mach-footbridge/dma.c
@@ -13,9 +13,9 @@
 #include <linux/init.h>
 #include <linux/io.h>
 #include <linux/spinlock.h>
+#include <linux/scatterlist.h>
 
 #include <asm/dma.h>
-#include <asm/scatterlist.h>
 
 #include <asm/mach/dma.h>
 #include <asm/hardware/dec21285.h>
diff --git a/arch/arm64/include/asm/Kbuild b/arch/arm64/include/asm/Kbuild
index 55103e50c51b6b..b112a39834d046 100644
--- a/arch/arm64/include/asm/Kbuild
+++ b/arch/arm64/include/asm/Kbuild
@@ -35,7 +35,6 @@ generic-y += poll.h
 generic-y += preempt.h
 generic-y += resource.h
 generic-y += rwsem.h
-generic-y += scatterlist.h
 generic-y += sections.h
 generic-y += segment.h
 generic-y += sembuf.h
diff --git a/arch/avr32/include/asm/Kbuild b/arch/avr32/include/asm/Kbuild
index 528d70d47a540d..1d66afdfac079b 100644
--- a/arch/avr32/include/asm/Kbuild
+++ b/arch/avr32/include/asm/Kbuild
@@ -15,7 +15,6 @@ generic-y += mcs_spinlock.h
 generic-y += param.h
 generic-y += percpu.h
 generic-y += preempt.h
-generic-y += scatterlist.h
 generic-y += sections.h
 generic-y += topology.h
 generic-y += trace_clock.h
diff --git a/arch/blackfin/include/asm/Kbuild b/arch/blackfin/include/asm/Kbuild
index 4bd3c3cfc9ab3a..07051a63415d71 100644
--- a/arch/blackfin/include/asm/Kbuild
+++ b/arch/blackfin/include/asm/Kbuild
@@ -29,7 +29,6 @@ generic-y += percpu.h
 generic-y += pgalloc.h
 generic-y += preempt.h
 generic-y += resource.h
-generic-y += scatterlist.h
 generic-y += sembuf.h
 generic-y += serial.h
 generic-y += setup.h
diff --git a/arch/blackfin/include/asm/pci.h b/arch/blackfin/include/asm/pci.h
index c737909fba4705..14efc0db1ade13 100644
--- a/arch/blackfin/include/asm/pci.h
+++ b/arch/blackfin/include/asm/pci.h
@@ -3,7 +3,7 @@
 #ifndef _ASM_BFIN_PCI_H
 #define _ASM_BFIN_PCI_H
 
-#include <asm/scatterlist.h>
+#include <linux/scatterlist.h>
 #include <asm-generic/pci-dma-compat.h>
 #include <asm-generic/pci.h>
 
diff --git a/arch/c6x/include/asm/Kbuild b/arch/c6x/include/asm/Kbuild
index ae0a51f5376cef..7aeb322729754f 100644
--- a/arch/c6x/include/asm/Kbuild
+++ b/arch/c6x/include/asm/Kbuild
@@ -38,7 +38,6 @@ generic-y += poll.h
 generic-y += posix_types.h
 generic-y += preempt.h
 generic-y += resource.h
-generic-y += scatterlist.h
 generic-y += segment.h
 generic-y += sembuf.h
 generic-y += serial.h
diff --git a/arch/cris/include/asm/Kbuild b/arch/cris/include/asm/Kbuild
index 2fc4331a69ca76..0e9648507fb605 100644
--- a/arch/cris/include/asm/Kbuild
+++ b/arch/cris/include/asm/Kbuild
@@ -31,7 +31,6 @@ generic-y += percpu.h
 generic-y += poll.h
 generic-y += preempt.h
 generic-y += resource.h
-generic-y += scatterlist.h
 generic-y += sections.h
 generic-y += sembuf.h
 generic-y += shmbuf.h
diff --git a/arch/cris/include/asm/dma-mapping.h b/arch/cris/include/asm/dma-mapping.h
index 2f0f654f1b4407..57f794ee6039d0 100644
--- a/arch/cris/include/asm/dma-mapping.h
+++ b/arch/cris/include/asm/dma-mapping.h
@@ -5,10 +5,10 @@
 
 #include <linux/mm.h>
 #include <linux/kernel.h>
+#include <linux/scatterlist.h>
 
 #include <asm/cache.h>
 #include <asm/io.h>
-#include <asm/scatterlist.h>
 
 #define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
 #define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
diff --git a/arch/cris/include/asm/pci.h b/arch/cris/include/asm/pci.h
index cc2399c175e9ea..c15b4b4baafaa8 100644
--- a/arch/cris/include/asm/pci.h
+++ b/arch/cris/include/asm/pci.h
@@ -29,7 +29,7 @@ int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq);
 
 #include <linux/types.h>
 #include <linux/slab.h>
-#include <asm/scatterlist.h>
+#include <linux/scatterlist.h>
 #include <linux/string.h>
 #include <asm/io.h>
 
diff --git a/arch/frv/include/asm/Kbuild b/arch/frv/include/asm/Kbuild
index e3f81b53578e3b..30edce31e5c21e 100644
--- a/arch/frv/include/asm/Kbuild
+++ b/arch/frv/include/asm/Kbuild
@@ -5,5 +5,4 @@ generic-y += exec.h
 generic-y += irq_work.h
 generic-y += mcs_spinlock.h
 generic-y += preempt.h
-generic-y += scatterlist.h
 generic-y += trace_clock.h
diff --git a/arch/frv/include/asm/dma-mapping.h b/arch/frv/include/asm/dma-mapping.h
index 1746a2b8e6e728..2840adcd6d928c 100644
--- a/arch/frv/include/asm/dma-mapping.h
+++ b/arch/frv/include/asm/dma-mapping.h
@@ -2,9 +2,9 @@
 #define _ASM_DMA_MAPPING_H
 
 #include <linux/device.h>
+#include <linux/scatterlist.h>
 #include <asm/cache.h>
 #include <asm/cacheflush.h>
-#include <asm/scatterlist.h>
 #include <asm/io.h>
 
 /*
diff --git a/arch/frv/include/asm/pci.h b/arch/frv/include/asm/pci.h
index 2035a4d3f9b98c..43d224685144a5 100644
--- a/arch/frv/include/asm/pci.h
+++ b/arch/frv/include/asm/pci.h
@@ -14,7 +14,7 @@
 #define _ASM_FRV_PCI_H
 
 #include <linux/mm.h>
-#include <asm/scatterlist.h>
+#include <linux/scatterlist.h>
 #include <asm-generic/pci-dma-compat.h>
 #include <asm-generic/pci.h>
 
diff --git a/arch/hexagon/include/asm/Kbuild b/arch/hexagon/include/asm/Kbuild
index c7a99f860b40f9..5ade4a163558f9 100644
--- a/arch/hexagon/include/asm/Kbuild
+++ b/arch/hexagon/include/asm/Kbuild
@@ -37,7 +37,6 @@ generic-y += posix_types.h
 generic-y += preempt.h
 generic-y += resource.h
 generic-y += rwsem.h
-generic-y += scatterlist.h
 generic-y += sections.h
 generic-y += segment.h
 generic-y += sembuf.h
diff --git a/arch/ia64/include/asm/Kbuild b/arch/ia64/include/asm/Kbuild
index 9b41b4bcc073b9..ccff13d33fa21e 100644
--- a/arch/ia64/include/asm/Kbuild
+++ b/arch/ia64/include/asm/Kbuild
@@ -5,6 +5,5 @@ generic-y += irq_work.h
 generic-y += kvm_para.h
 generic-y += mcs_spinlock.h
 generic-y += preempt.h
-generic-y += scatterlist.h
 generic-y += trace_clock.h
 generic-y += vtime.h
diff --git a/arch/ia64/include/asm/pci.h b/arch/ia64/include/asm/pci.h
index 52af5ed9f60ba9..0c0e25d6427a19 100644
--- a/arch/ia64/include/asm/pci.h
+++ b/arch/ia64/include/asm/pci.h
@@ -6,9 +6,9 @@
 #include <linux/spinlock.h>
 #include <linux/string.h>
 #include <linux/types.h>
+#include <linux/scatterlist.h>
 
 #include <asm/io.h>
-#include <asm/scatterlist.h>
 #include <asm/hw_irq.h>
 
 struct pci_vector_struct {
diff --git a/arch/m32r/include/asm/Kbuild b/arch/m32r/include/asm/Kbuild
index 2edc793372fc13..ba1cdc01873151 100644
--- a/arch/m32r/include/asm/Kbuild
+++ b/arch/m32r/include/asm/Kbuild
@@ -6,6 +6,5 @@ generic-y += irq_work.h
 generic-y += mcs_spinlock.h
 generic-y += module.h
 generic-y += preempt.h
-generic-y += scatterlist.h
 generic-y += sections.h
 generic-y += trace_clock.h
diff --git a/arch/m68k/include/asm/Kbuild b/arch/m68k/include/asm/Kbuild
index 1517ed1c647110..1555bc189c7d65 100644
--- a/arch/m68k/include/asm/Kbuild
+++ b/arch/m68k/include/asm/Kbuild
@@ -23,7 +23,6 @@ generic-y += mutex.h
 generic-y += percpu.h
 generic-y += preempt.h
 generic-y += resource.h
-generic-y += scatterlist.h
 generic-y += sections.h
 generic-y += shmparam.h
 generic-y += siginfo.h
diff --git a/arch/metag/include/asm/Kbuild b/arch/metag/include/asm/Kbuild
index 0bf5d525b945d3..199320f3c34528 100644
--- a/arch/metag/include/asm/Kbuild
+++ b/arch/metag/include/asm/Kbuild
@@ -33,7 +33,6 @@ generic-y += percpu.h
 generic-y += poll.h
 generic-y += posix_types.h
 generic-y += preempt.h
-generic-y += scatterlist.h
 generic-y += sections.h
 generic-y += sembuf.h
 generic-y += serial.h
diff --git a/arch/microblaze/include/asm/Kbuild b/arch/microblaze/include/asm/Kbuild
index ab564a6db5c32a..9989ddb169cab9 100644
--- a/arch/microblaze/include/asm/Kbuild
+++ b/arch/microblaze/include/asm/Kbuild
@@ -7,6 +7,5 @@ generic-y += exec.h
 generic-y += irq_work.h
 generic-y += mcs_spinlock.h
 generic-y += preempt.h
-generic-y += scatterlist.h
 generic-y += syscalls.h
 generic-y += trace_clock.h
diff --git a/arch/microblaze/include/asm/pci.h b/arch/microblaze/include/asm/pci.h
index 468aca8cec0d32..81f27aef979c48 100644
--- a/arch/microblaze/include/asm/pci.h
+++ b/arch/microblaze/include/asm/pci.h
@@ -16,8 +16,8 @@
 #include <linux/string.h>
 #include <linux/dma-mapping.h>
 #include <linux/pci.h>
+#include <linux/scatterlist.h>
 
-#include <asm/scatterlist.h>
 #include <asm/io.h>
 #include <asm/prom.h>
 #include <asm/pci-bridge.h>
diff --git a/arch/mips/include/asm/Kbuild b/arch/mips/include/asm/Kbuild
index 526539cbc99f67..7fe5c61a3cb83a 100644
--- a/arch/mips/include/asm/Kbuild
+++ b/arch/mips/include/asm/Kbuild
@@ -11,7 +11,6 @@ generic-y += mutex.h
 generic-y += parport.h
 generic-y += percpu.h
 generic-y += preempt.h
-generic-y += scatterlist.h
 generic-y += sections.h
 generic-y += segment.h
 generic-y += serial.h
diff --git a/arch/mips/include/asm/dma-mapping.h b/arch/mips/include/asm/dma-mapping.h
index fd1b4a15075991..360b3387182af2 100644
--- a/arch/mips/include/asm/dma-mapping.h
+++ b/arch/mips/include/asm/dma-mapping.h
@@ -1,7 +1,7 @@
 #ifndef _ASM_DMA_MAPPING_H
 #define _ASM_DMA_MAPPING_H
 
-#include <asm/scatterlist.h>
+#include <linux/scatterlist.h>
 #include <asm/dma-coherence.h>
 #include <asm/cache.h>
 #include <asm-generic/dma-coherent.h>
diff --git a/arch/mips/include/asm/pci.h b/arch/mips/include/asm/pci.h
index d9692993fc8337..3413b10d833bc9 100644
--- a/arch/mips/include/asm/pci.h
+++ b/arch/mips/include/asm/pci.h
@@ -99,7 +99,7 @@ static inline void pci_resource_to_user(const struct pci_dev *dev, int bar,
 
 #include <linux/types.h>
 #include <linux/slab.h>
-#include <asm/scatterlist.h>
+#include <linux/scatterlist.h>
 #include <linux/string.h>
 #include <asm/io.h>
 #include <asm-generic/pci-bridge.h>
diff --git a/arch/mn10300/include/asm/Kbuild b/arch/mn10300/include/asm/Kbuild
index f892d9de47d926..de30b0c887968c 100644
--- a/arch/mn10300/include/asm/Kbuild
+++ b/arch/mn10300/include/asm/Kbuild
@@ -6,6 +6,5 @@ generic-y += exec.h
 generic-y += irq_work.h
 generic-y += mcs_spinlock.h
 generic-y += preempt.h
-generic-y += scatterlist.h
 generic-y += sections.h
 generic-y += trace_clock.h
diff --git a/arch/mn10300/include/asm/pci.h b/arch/mn10300/include/asm/pci.h
index 5f70af25c7d02a..393b51d38f3618 100644
--- a/arch/mn10300/include/asm/pci.h
+++ b/arch/mn10300/include/asm/pci.h
@@ -55,7 +55,7 @@ void pcibios_set_master(struct pci_dev *dev);
 
 #include <linux/types.h>
 #include <linux/slab.h>
-#include <asm/scatterlist.h>
+#include <linux/scatterlist.h>
 #include <linux/string.h>
 #include <asm/io.h>
 
diff --git a/arch/nios2/include/asm/Kbuild b/arch/nios2/include/asm/Kbuild
index 24b3d8999ac709..434639d510b3cb 100644
--- a/arch/nios2/include/asm/Kbuild
+++ b/arch/nios2/include/asm/Kbuild
@@ -40,7 +40,6 @@ generic-y += poll.h
 generic-y += posix_types.h
 generic-y += preempt.h
 generic-y += resource.h
-generic-y += scatterlist.h
 generic-y += sections.h
 generic-y += segment.h
 generic-y += sembuf.h
diff --git a/arch/openrisc/include/asm/Kbuild b/arch/openrisc/include/asm/Kbuild
index 91f1f360a7c4da..2a2e39b8109abd 100644
--- a/arch/openrisc/include/asm/Kbuild
+++ b/arch/openrisc/include/asm/Kbuild
@@ -45,7 +45,6 @@ generic-y += poll.h
 generic-y += posix_types.h
 generic-y += preempt.h
 generic-y += resource.h
-generic-y += scatterlist.h
 generic-y += sections.h
 generic-y += segment.h
 generic-y += sembuf.h
diff --git a/arch/parisc/include/asm/Kbuild b/arch/parisc/include/asm/Kbuild
index 7a4bcc36303d08..12b341d04f88d0 100644
--- a/arch/parisc/include/asm/Kbuild
+++ b/arch/parisc/include/asm/Kbuild
@@ -20,7 +20,6 @@ generic-y += param.h
 generic-y += percpu.h
 generic-y += poll.h
 generic-y += preempt.h
-generic-y += scatterlist.h
 generic-y += seccomp.h
 generic-y += segment.h
 generic-y += topology.h
diff --git a/arch/parisc/include/asm/dma-mapping.h b/arch/parisc/include/asm/dma-mapping.h
index d0eae5f2bd8795..d8d60a57183fb3 100644
--- a/arch/parisc/include/asm/dma-mapping.h
+++ b/arch/parisc/include/asm/dma-mapping.h
@@ -2,8 +2,8 @@
 #define _PARISC_DMA_MAPPING_H
 
 #include <linux/mm.h>
+#include <linux/scatterlist.h>
 #include <asm/cacheflush.h>
-#include <asm/scatterlist.h>
 
 /* See Documentation/DMA-API-HOWTO.txt */
 struct hppa_dma_ops {
diff --git a/arch/parisc/include/asm/pci.h b/arch/parisc/include/asm/pci.h
index 20df2b04fc09f4..794d8820db5d77 100644
--- a/arch/parisc/include/asm/pci.h
+++ b/arch/parisc/include/asm/pci.h
@@ -1,7 +1,7 @@
 #ifndef __ASM_PARISC_PCI_H
 #define __ASM_PARISC_PCI_H
 
-#include <asm/scatterlist.h>
+#include <linux/scatterlist.h>
 
 
 
diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild
index 4b87205c230ca6..050712e1ce4126 100644
--- a/arch/powerpc/include/asm/Kbuild
+++ b/arch/powerpc/include/asm/Kbuild
@@ -6,6 +6,5 @@ generic-y += local64.h
 generic-y += mcs_spinlock.h
 generic-y += preempt.h
 generic-y += rwsem.h
-generic-y += scatterlist.h
 generic-y += trace_clock.h
 generic-y += vtime.h
diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h
index 4aef8d6609997f..3be43ab63181c2 100644
--- a/arch/powerpc/include/asm/pci.h
+++ b/arch/powerpc/include/asm/pci.h
@@ -13,9 +13,9 @@
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/dma-mapping.h>
+#include <linux/scatterlist.h>
 
 #include <asm/machdep.h>
-#include <asm/scatterlist.h>
 #include <asm/io.h>
 #include <asm/prom.h>
 #include <asm/pci-bridge.h>
diff --git a/arch/powerpc/include/asm/vio.h b/arch/powerpc/include/asm/vio.h
index 4f9b7ca0710fc5..84286ec77b124c 100644
--- a/arch/powerpc/include/asm/vio.h
+++ b/arch/powerpc/include/asm/vio.h
@@ -19,9 +19,9 @@
 #include <linux/device.h>
 #include <linux/dma-mapping.h>
 #include <linux/mod_devicetable.h>
+#include <linux/scatterlist.h>
 
 #include <asm/hvcall.h>
-#include <asm/scatterlist.h>
 
 /*
  * Architecture-specific constants for drivers to
diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild
index c631f98fd5242b..dc5385ebb071ac 100644
--- a/arch/s390/include/asm/Kbuild
+++ b/arch/s390/include/asm/Kbuild
@@ -4,5 +4,4 @@ generic-y += clkdev.h
 generic-y += irq_work.h
 generic-y += mcs_spinlock.h
 generic-y += preempt.h
-generic-y += scatterlist.h
 generic-y += trace_clock.h
diff --git a/arch/score/include/asm/Kbuild b/arch/score/include/asm/Kbuild
index 83ed116d414c1a..138fb3db45ba66 100644
--- a/arch/score/include/asm/Kbuild
+++ b/arch/score/include/asm/Kbuild
@@ -8,7 +8,6 @@ generic-y += cputime.h
 generic-y += irq_work.h
 generic-y += mcs_spinlock.h
 generic-y += preempt.h
-generic-y += scatterlist.h
 generic-y += sections.h
 generic-y += trace_clock.h
 generic-y += xor.h
diff --git a/arch/sh/include/asm/Kbuild b/arch/sh/include/asm/Kbuild
index 654ebb6bd5d8da..9ac4626e72844b 100644
--- a/arch/sh/include/asm/Kbuild
+++ b/arch/sh/include/asm/Kbuild
@@ -24,7 +24,6 @@ generic-y += percpu.h
 generic-y += poll.h
 generic-y += preempt.h
 generic-y += resource.h
-generic-y += scatterlist.h
 generic-y += sembuf.h
 generic-y += serial.h
 generic-y += shmbuf.h
diff --git a/arch/sparc/include/asm/Kbuild b/arch/sparc/include/asm/Kbuild
index 94f36e7086a769..2b2a69dcc467cb 100644
--- a/arch/sparc/include/asm/Kbuild
+++ b/arch/sparc/include/asm/Kbuild
@@ -15,7 +15,6 @@ generic-y += mcs_spinlock.h
 generic-y += module.h
 generic-y += mutex.h
 generic-y += preempt.h
-generic-y += scatterlist.h
 generic-y += serial.h
 generic-y += trace_clock.h
 generic-y += types.h
diff --git a/arch/sparc/kernel/iommu_common.h b/arch/sparc/kernel/iommu_common.h
index f4be0d724fc6e6..b40cec25290503 100644
--- a/arch/sparc/kernel/iommu_common.h
+++ b/arch/sparc/kernel/iommu_common.h
@@ -13,9 +13,9 @@
 #include <linux/scatterlist.h>
 #include <linux/device.h>
 #include <linux/iommu-helper.h>
+#include <linux/scatterlist.h>
 
 #include <asm/iommu.h>
-#include <asm/scatterlist.h>
 
 /*
  * These give mapping size of each iommu pte/tlb.
diff --git a/arch/tile/include/asm/Kbuild b/arch/tile/include/asm/Kbuild
index f5433e0e34e0bc..d53654488c2c15 100644
--- a/arch/tile/include/asm/Kbuild
+++ b/arch/tile/include/asm/Kbuild
@@ -27,7 +27,6 @@ generic-y += poll.h
 generic-y += posix_types.h
 generic-y += preempt.h
 generic-y += resource.h
-generic-y += scatterlist.h
 generic-y += sembuf.h
 generic-y += serial.h
 generic-y += shmbuf.h
diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild
index ea9ec2e3438401..3d63ff6f583ff4 100644
--- a/arch/um/include/asm/Kbuild
+++ b/arch/um/include/asm/Kbuild
@@ -21,7 +21,6 @@ generic-y += param.h
 generic-y += pci.h
 generic-y += percpu.h
 generic-y += preempt.h
-generic-y += scatterlist.h
 generic-y += switch_to.h
 generic-y += topology.h
 generic-y += trace_clock.h
diff --git a/arch/unicore32/include/asm/Kbuild b/arch/unicore32/include/asm/Kbuild
index 3e0c19d0f4c5d9..d12b377b5a8b6b 100644
--- a/arch/unicore32/include/asm/Kbuild
+++ b/arch/unicore32/include/asm/Kbuild
@@ -36,7 +36,6 @@ generic-y += poll.h
 generic-y += posix_types.h
 generic-y += preempt.h
 generic-y += resource.h
-generic-y += scatterlist.h
 generic-y += sections.h
 generic-y += segment.h
 generic-y += sembuf.h
diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild
index d55a210a49bf73..4dd1f2d770af41 100644
--- a/arch/x86/include/asm/Kbuild
+++ b/arch/x86/include/asm/Kbuild
@@ -9,4 +9,3 @@ generic-y += cputime.h
 generic-y += dma-contiguous.h
 generic-y += early_ioremap.h
 generic-y += mcs_spinlock.h
-generic-y += scatterlist.h
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index 4e370a5d81170e..81da003df21b08 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -5,7 +5,7 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/string.h>
-#include <asm/scatterlist.h>
+#include <linux/scatterlist.h>
 #include <asm/io.h>
 #include <asm/x86_init.h>
 
diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild
index 86a9ab2e2ca92c..14d15bf1a95bfb 100644
--- a/arch/xtensa/include/asm/Kbuild
+++ b/arch/xtensa/include/asm/Kbuild
@@ -22,7 +22,6 @@ generic-y += mcs_spinlock.h
 generic-y += percpu.h
 generic-y += preempt.h
 generic-y += resource.h
-generic-y += scatterlist.h
 generic-y += sections.h
 generic-y += siginfo.h
 generic-y += statfs.h
diff --git a/arch/xtensa/include/asm/pci.h b/arch/xtensa/include/asm/pci.h
index 5d52dc43dfe73e..e438a00fbd6399 100644
--- a/arch/xtensa/include/asm/pci.h
+++ b/arch/xtensa/include/asm/pci.h
@@ -33,7 +33,7 @@ extern struct pci_controller* pcibios_alloc_controller(void);
 
 #include <linux/types.h>
 #include <linux/slab.h>
-#include <asm/scatterlist.h>
+#include <linux/scatterlist.h>
 #include <linux/string.h>
 #include <asm/io.h>
 
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 5cbd5d9ea61dd5..0436c21db7f23b 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -361,7 +361,7 @@ static void bio_integrity_verify_fn(struct work_struct *work)
 
 	/* Restore original bio completion handler */
 	bio->bi_end_io = bip->bip_end_io;
-	bio_endio_nodec(bio, error);
+	bio_endio(bio, error);
 }
 
 /**
@@ -388,7 +388,7 @@ void bio_integrity_endio(struct bio *bio, int error)
 	 */
 	if (error) {
 		bio->bi_end_io = bip->bip_end_io;
-		bio_endio_nodec(bio, error);
+		bio_endio(bio, error);
 
 		return;
 	}
diff --git a/block/bio.c b/block/bio.c
index f66a4eae16ee4a..2a00d349cd6883 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -270,8 +270,8 @@ void bio_init(struct bio *bio)
 {
 	memset(bio, 0, sizeof(*bio));
 	bio->bi_flags = 1 << BIO_UPTODATE;
-	atomic_set(&bio->bi_remaining, 1);
-	atomic_set(&bio->bi_cnt, 1);
+	atomic_set(&bio->__bi_remaining, 1);
+	atomic_set(&bio->__bi_cnt, 1);
 }
 EXPORT_SYMBOL(bio_init);
 
@@ -292,8 +292,8 @@ void bio_reset(struct bio *bio)
 	__bio_free(bio);
 
 	memset(bio, 0, BIO_RESET_BYTES);
-	bio->bi_flags = flags|(1 << BIO_UPTODATE);
-	atomic_set(&bio->bi_remaining, 1);
+	bio->bi_flags = flags | (1 << BIO_UPTODATE);
+	atomic_set(&bio->__bi_remaining, 1);
 }
 EXPORT_SYMBOL(bio_reset);
 
@@ -303,6 +303,17 @@ static void bio_chain_endio(struct bio *bio, int error)
 	bio_put(bio);
 }
 
+/*
+ * Increment chain count for the bio. Make sure the CHAIN flag update
+ * is visible before the raised count.
+ */
+static inline void bio_inc_remaining(struct bio *bio)
+{
+	bio->bi_flags |= (1 << BIO_CHAIN);
+	smp_mb__before_atomic();
+	atomic_inc(&bio->__bi_remaining);
+}
+
 /**
  * bio_chain - chain bio completions
  * @bio: the target bio
@@ -320,7 +331,7 @@ void bio_chain(struct bio *bio, struct bio *parent)
 
 	bio->bi_private = parent;
 	bio->bi_end_io	= bio_chain_endio;
-	atomic_inc(&parent->bi_remaining);
+	bio_inc_remaining(parent);
 }
 EXPORT_SYMBOL(bio_chain);
 
@@ -524,13 +535,17 @@ EXPORT_SYMBOL(zero_fill_bio);
  **/
 void bio_put(struct bio *bio)
 {
-	BIO_BUG_ON(!atomic_read(&bio->bi_cnt));
-
-	/*
-	 * last put frees it
-	 */
-	if (atomic_dec_and_test(&bio->bi_cnt))
+	if (!bio_flagged(bio, BIO_REFFED))
 		bio_free(bio);
+	else {
+		BIO_BUG_ON(!atomic_read(&bio->__bi_cnt));
+
+		/*
+		 * last put frees it
+		 */
+		if (atomic_dec_and_test(&bio->__bi_cnt))
+			bio_free(bio);
+	}
 }
 EXPORT_SYMBOL(bio_put);
 
@@ -1741,6 +1756,25 @@ void bio_flush_dcache_pages(struct bio *bi)
 EXPORT_SYMBOL(bio_flush_dcache_pages);
 #endif
 
+static inline bool bio_remaining_done(struct bio *bio)
+{
+	/*
+	 * If we're not chaining, then ->__bi_remaining is always 1 and
+	 * we always end io on the first invocation.
+	 */
+	if (!bio_flagged(bio, BIO_CHAIN))
+		return true;
+
+	BUG_ON(atomic_read(&bio->__bi_remaining) <= 0);
+
+	if (atomic_dec_and_test(&bio->__bi_remaining)) {
+		clear_bit(BIO_CHAIN, &bio->bi_flags);
+		return true;
+	}
+
+	return false;
+}
+
 /**
  * bio_endio - end I/O on a bio
  * @bio:	bio
@@ -1758,15 +1792,13 @@ EXPORT_SYMBOL(bio_flush_dcache_pages);
 void bio_endio(struct bio *bio, int error)
 {
 	while (bio) {
-		BUG_ON(atomic_read(&bio->bi_remaining) <= 0);
-
 		if (error)
 			clear_bit(BIO_UPTODATE, &bio->bi_flags);
 		else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
 			error = -EIO;
 
-		if (!atomic_dec_and_test(&bio->bi_remaining))
-			return;
+		if (unlikely(!bio_remaining_done(bio)))
+			break;
 
 		/*
 		 * Need to have a real endio function for chained bios,
@@ -1790,21 +1822,6 @@ void bio_endio(struct bio *bio, int error)
 EXPORT_SYMBOL(bio_endio);
 
 /**
- * bio_endio_nodec - end I/O on a bio, without decrementing bi_remaining
- * @bio:	bio
- * @error:	error, if any
- *
- * For code that has saved and restored bi_end_io; thing hard before using this
- * function, probably you should've cloned the entire bio.
- **/
-void bio_endio_nodec(struct bio *bio, int error)
-{
-	atomic_inc(&bio->bi_remaining);
-	bio_endio(bio, error);
-}
-EXPORT_SYMBOL(bio_endio_nodec);
-
-/**
  * bio_split - split a bio
  * @bio:	bio to split
  * @sectors:	number of sectors to split from the front of @bio
@@ -1971,6 +1988,28 @@ struct bio_set *bioset_create_nobvec(unsigned int pool_size, unsigned int front_
 EXPORT_SYMBOL(bioset_create_nobvec);
 
 #ifdef CONFIG_BLK_CGROUP
+
+/**
+ * bio_associate_blkcg - associate a bio with the specified blkcg
+ * @bio: target bio
+ * @blkcg_css: css of the blkcg to associate
+ *
+ * Associate @bio with the blkcg specified by @blkcg_css.  Block layer will
+ * treat @bio as if it were issued by a task which belongs to the blkcg.
+ *
+ * This function takes an extra reference of @blkcg_css which will be put
+ * when @bio is released.  The caller must own @bio and is responsible for
+ * synchronizing calls to this function.
+ */
+int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css)
+{
+	if (unlikely(bio->bi_css))
+		return -EBUSY;
+	css_get(blkcg_css);
+	bio->bi_css = blkcg_css;
+	return 0;
+}
+
 /**
  * bio_associate_current - associate a bio with %current
  * @bio: target bio
@@ -1987,26 +2026,17 @@ EXPORT_SYMBOL(bioset_create_nobvec);
 int bio_associate_current(struct bio *bio)
 {
 	struct io_context *ioc;
-	struct cgroup_subsys_state *css;
 
-	if (bio->bi_ioc)
+	if (bio->bi_css)
 		return -EBUSY;
 
 	ioc = current->io_context;
 	if (!ioc)
 		return -ENOENT;
 
-	/* acquire active ref on @ioc and associate */
 	get_io_context_active(ioc);
 	bio->bi_ioc = ioc;
-
-	/* associate blkcg if exists */
-	rcu_read_lock();
-	css = task_css(current, blkio_cgrp_id);
-	if (css && css_tryget_online(css))
-		bio->bi_css = css;
-	rcu_read_unlock();
-
+	bio->bi_css = task_get_css(current, blkio_cgrp_id);
 	return 0;
 }
 
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 0ac817b750dbc7..31610ae0ebff2b 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -15,11 +15,12 @@
 #include <linux/module.h>
 #include <linux/err.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/slab.h>
 #include <linux/genhd.h>
 #include <linux/delay.h>
 #include <linux/atomic.h>
-#include "blk-cgroup.h"
+#include <linux/blk-cgroup.h>
 #include "blk.h"
 
 #define MAX_KEY_LEN 100
@@ -30,6 +31,8 @@ struct blkcg blkcg_root = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT,
 			    .cfq_leaf_weight = 2 * CFQ_WEIGHT_DEFAULT, };
 EXPORT_SYMBOL_GPL(blkcg_root);
 
+struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css;
+
 static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
 
 static bool blkcg_policy_enabled(struct request_queue *q,
@@ -179,6 +182,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
 				    struct blkcg_gq *new_blkg)
 {
 	struct blkcg_gq *blkg;
+	struct bdi_writeback_congested *wb_congested;
 	int i, ret;
 
 	WARN_ON_ONCE(!rcu_read_lock_held());
@@ -190,22 +194,30 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
 		goto err_free_blkg;
 	}
 
+	wb_congested = wb_congested_get_create(&q->backing_dev_info,
+					       blkcg->css.id, GFP_ATOMIC);
+	if (!wb_congested) {
+		ret = -ENOMEM;
+		goto err_put_css;
+	}
+
 	/* allocate */
 	if (!new_blkg) {
 		new_blkg = blkg_alloc(blkcg, q, GFP_ATOMIC);
 		if (unlikely(!new_blkg)) {
 			ret = -ENOMEM;
-			goto err_put_css;
+			goto err_put_congested;
 		}
 	}
 	blkg = new_blkg;
+	blkg->wb_congested = wb_congested;
 
 	/* link parent */
 	if (blkcg_parent(blkcg)) {
 		blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false);
 		if (WARN_ON_ONCE(!blkg->parent)) {
 			ret = -EINVAL;
-			goto err_put_css;
+			goto err_put_congested;
 		}
 		blkg_get(blkg->parent);
 	}
@@ -235,18 +247,15 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
 	blkg->online = true;
 	spin_unlock(&blkcg->lock);
 
-	if (!ret) {
-		if (blkcg == &blkcg_root) {
-			q->root_blkg = blkg;
-			q->root_rl.blkg = blkg;
-		}
+	if (!ret)
 		return blkg;
-	}
 
 	/* @blkg failed fully initialized, use the usual release path */
 	blkg_put(blkg);
 	return ERR_PTR(ret);
 
+err_put_congested:
+	wb_congested_put(wb_congested);
 err_put_css:
 	css_put(&blkcg->css);
 err_free_blkg:
@@ -340,15 +349,6 @@ static void blkg_destroy(struct blkcg_gq *blkg)
 		rcu_assign_pointer(blkcg->blkg_hint, NULL);
 
 	/*
-	 * If root blkg is destroyed.  Just clear the pointer since root_rl
-	 * does not take reference on root blkg.
-	 */
-	if (blkcg == &blkcg_root) {
-		blkg->q->root_blkg = NULL;
-		blkg->q->root_rl.blkg = NULL;
-	}
-
-	/*
 	 * Put the reference taken at the time of creation so that when all
 	 * queues are gone, group can be destroyed.
 	 */
@@ -402,6 +402,8 @@ void __blkg_release_rcu(struct rcu_head *rcu_head)
 	if (blkg->parent)
 		blkg_put(blkg->parent);
 
+	wb_congested_put(blkg->wb_congested);
+
 	blkg_free(blkg);
 }
 EXPORT_SYMBOL_GPL(__blkg_release_rcu);
@@ -809,6 +811,8 @@ static void blkcg_css_offline(struct cgroup_subsys_state *css)
 	}
 
 	spin_unlock_irq(&blkcg->lock);
+
+	wb_blkcg_offline(blkcg);
 }
 
 static void blkcg_css_free(struct cgroup_subsys_state *css)
@@ -839,7 +843,9 @@ done:
 	spin_lock_init(&blkcg->lock);
 	INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC);
 	INIT_HLIST_HEAD(&blkcg->blkg_list);
-
+#ifdef CONFIG_CGROUP_WRITEBACK
+	INIT_LIST_HEAD(&blkcg->cgwb_list);
+#endif
 	return &blkcg->css;
 }
 
@@ -855,9 +861,45 @@ done:
  */
 int blkcg_init_queue(struct request_queue *q)
 {
-	might_sleep();
+	struct blkcg_gq *new_blkg, *blkg;
+	bool preloaded;
+	int ret;
+
+	new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL);
+	if (!new_blkg)
+		return -ENOMEM;
+
+	preloaded = !radix_tree_preload(GFP_KERNEL);
+
+	/*
+	 * Make sure the root blkg exists and count the existing blkgs.  As
+	 * @q is bypassing at this point, blkg_lookup_create() can't be
+	 * used.  Open code insertion.
+	 */
+	rcu_read_lock();
+	spin_lock_irq(q->queue_lock);
+	blkg = blkg_create(&blkcg_root, q, new_blkg);
+	spin_unlock_irq(q->queue_lock);
+	rcu_read_unlock();
+
+	if (preloaded)
+		radix_tree_preload_end();
+
+	if (IS_ERR(blkg)) {
+		kfree(new_blkg);
+		return PTR_ERR(blkg);
+	}
 
-	return blk_throtl_init(q);
+	q->root_blkg = blkg;
+	q->root_rl.blkg = blkg;
+
+	ret = blk_throtl_init(q);
+	if (ret) {
+		spin_lock_irq(q->queue_lock);
+		blkg_destroy_all(q);
+		spin_unlock_irq(q->queue_lock);
+	}
+	return ret;
 }
 
 /**
@@ -958,52 +1000,20 @@ int blkcg_activate_policy(struct request_queue *q,
 			  const struct blkcg_policy *pol)
 {
 	LIST_HEAD(pds);
-	struct blkcg_gq *blkg, *new_blkg;
+	struct blkcg_gq *blkg;
 	struct blkg_policy_data *pd, *n;
 	int cnt = 0, ret;
-	bool preloaded;
 
 	if (blkcg_policy_enabled(q, pol))
 		return 0;
 
-	/* preallocations for root blkg */
-	new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL);
-	if (!new_blkg)
-		return -ENOMEM;
-
+	/* count and allocate policy_data for all existing blkgs */
 	blk_queue_bypass_start(q);
-
-	preloaded = !radix_tree_preload(GFP_KERNEL);
-
-	/*
-	 * Make sure the root blkg exists and count the existing blkgs.  As
-	 * @q is bypassing at this point, blkg_lookup_create() can't be
-	 * used.  Open code it.
-	 */
 	spin_lock_irq(q->queue_lock);
-
-	rcu_read_lock();
-	blkg = __blkg_lookup(&blkcg_root, q, false);
-	if (blkg)
-		blkg_free(new_blkg);
-	else
-		blkg = blkg_create(&blkcg_root, q, new_blkg);
-	rcu_read_unlock();
-
-	if (preloaded)
-		radix_tree_preload_end();
-
-	if (IS_ERR(blkg)) {
-		ret = PTR_ERR(blkg);
-		goto out_unlock;
-	}
-
 	list_for_each_entry(blkg, &q->blkg_list, q_node)
 		cnt++;
-
 	spin_unlock_irq(q->queue_lock);
 
-	/* allocate policy_data for all existing blkgs */
 	while (cnt--) {
 		pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node);
 		if (!pd) {
@@ -1072,10 +1082,6 @@ void blkcg_deactivate_policy(struct request_queue *q,
 
 	__clear_bit(pol->plid, q->blkcg_pols);
 
-	/* if no policy is left, no need for blkgs - shoot them down */
-	if (bitmap_empty(q->blkcg_pols, BLKCG_MAX_POLS))
-		blkg_destroy_all(q);
-
 	list_for_each_entry(blkg, &q->blkg_list, q_node) {
 		/* grab blkcg lock too while removing @pd from @blkg */
 		spin_lock(&blkg->blkcg->lock);
diff --git a/block/blk-core.c b/block/blk-core.c
index 03b5f8d77f37b4..688ae9482cb8ea 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -32,12 +32,12 @@
 #include <linux/delay.h>
 #include <linux/ratelimit.h>
 #include <linux/pm_runtime.h>
+#include <linux/blk-cgroup.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/block.h>
 
 #include "blk.h"
-#include "blk-cgroup.h"
 #include "blk-mq.h"
 
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
@@ -63,6 +63,31 @@ struct kmem_cache *blk_requestq_cachep;
  */
 static struct workqueue_struct *kblockd_workqueue;
 
+static void blk_clear_congested(struct request_list *rl, int sync)
+{
+#ifdef CONFIG_CGROUP_WRITEBACK
+	clear_wb_congested(rl->blkg->wb_congested, sync);
+#else
+	/*
+	 * If !CGROUP_WRITEBACK, all blkg's map to bdi->wb and we shouldn't
+	 * flip its congestion state for events on other blkcgs.
+	 */
+	if (rl == &rl->q->root_rl)
+		clear_wb_congested(rl->q->backing_dev_info.wb.congested, sync);
+#endif
+}
+
+static void blk_set_congested(struct request_list *rl, int sync)
+{
+#ifdef CONFIG_CGROUP_WRITEBACK
+	set_wb_congested(rl->blkg->wb_congested, sync);
+#else
+	/* see blk_clear_congested() */
+	if (rl == &rl->q->root_rl)
+		set_wb_congested(rl->q->backing_dev_info.wb.congested, sync);
+#endif
+}
+
 void blk_queue_congestion_threshold(struct request_queue *q)
 {
 	int nr;
@@ -117,7 +142,7 @@ EXPORT_SYMBOL(blk_rq_init);
 static void req_bio_endio(struct request *rq, struct bio *bio,
 			  unsigned int nbytes, int error)
 {
-	if (error)
+	if (error && !(rq->cmd_flags & REQ_CLONE))
 		clear_bit(BIO_UPTODATE, &bio->bi_flags);
 	else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
 		error = -EIO;
@@ -128,7 +153,8 @@ static void req_bio_endio(struct request *rq, struct bio *bio,
 	bio_advance(bio, nbytes);
 
 	/* don't actually finish bio if it's part of flush sequence */
-	if (bio->bi_iter.bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ))
+	if (bio->bi_iter.bi_size == 0 &&
+	    !(rq->cmd_flags & (REQ_FLUSH_SEQ|REQ_CLONE)))
 		bio_endio(bio, error);
 }
 
@@ -285,6 +311,7 @@ inline void __blk_run_queue_uncond(struct request_queue *q)
 	q->request_fn(q);
 	q->request_fn_active--;
 }
+EXPORT_SYMBOL_GPL(__blk_run_queue_uncond);
 
 /**
  * __blk_run_queue - run a single device queue
@@ -621,8 +648,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 
 	q->backing_dev_info.ra_pages =
 			(VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
-	q->backing_dev_info.state = 0;
-	q->backing_dev_info.capabilities = 0;
+	q->backing_dev_info.capabilities = BDI_CAP_CGROUP_WRITEBACK;
 	q->backing_dev_info.name = "block";
 	q->node = node_id;
 
@@ -845,13 +871,8 @@ static void __freed_request(struct request_list *rl, int sync)
 {
 	struct request_queue *q = rl->q;
 
-	/*
-	 * bdi isn't aware of blkcg yet.  As all async IOs end up root
-	 * blkcg anyway, just use root blkcg state.
-	 */
-	if (rl == &q->root_rl &&
-	    rl->count[sync] < queue_congestion_off_threshold(q))
-		blk_clear_queue_congested(q, sync);
+	if (rl->count[sync] < queue_congestion_off_threshold(q))
+		blk_clear_congested(rl, sync);
 
 	if (rl->count[sync] + 1 <= q->nr_requests) {
 		if (waitqueue_active(&rl->wait[sync]))
@@ -884,25 +905,25 @@ static void freed_request(struct request_list *rl, unsigned int flags)
 int blk_update_nr_requests(struct request_queue *q, unsigned int nr)
 {
 	struct request_list *rl;
+	int on_thresh, off_thresh;
 
 	spin_lock_irq(q->queue_lock);
 	q->nr_requests = nr;
 	blk_queue_congestion_threshold(q);
+	on_thresh = queue_congestion_on_threshold(q);
+	off_thresh = queue_congestion_off_threshold(q);
 
-	/* congestion isn't cgroup aware and follows root blkcg for now */
-	rl = &q->root_rl;
-
-	if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q))
-		blk_set_queue_congested(q, BLK_RW_SYNC);
-	else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q))
-		blk_clear_queue_congested(q, BLK_RW_SYNC);
+	blk_queue_for_each_rl(rl, q) {
+		if (rl->count[BLK_RW_SYNC] >= on_thresh)
+			blk_set_congested(rl, BLK_RW_SYNC);
+		else if (rl->count[BLK_RW_SYNC] < off_thresh)
+			blk_clear_congested(rl, BLK_RW_SYNC);
 
-	if (rl->count[BLK_RW_ASYNC] >= queue_congestion_on_threshold(q))
-		blk_set_queue_congested(q, BLK_RW_ASYNC);
-	else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q))
-		blk_clear_queue_congested(q, BLK_RW_ASYNC);
+		if (rl->count[BLK_RW_ASYNC] >= on_thresh)
+			blk_set_congested(rl, BLK_RW_ASYNC);
+		else if (rl->count[BLK_RW_ASYNC] < off_thresh)
+			blk_clear_congested(rl, BLK_RW_ASYNC);
 
-	blk_queue_for_each_rl(rl, q) {
 		if (rl->count[BLK_RW_SYNC] >= q->nr_requests) {
 			blk_set_rl_full(rl, BLK_RW_SYNC);
 		} else {
@@ -1012,12 +1033,7 @@ static struct request *__get_request(struct request_list *rl, int rw_flags,
 				}
 			}
 		}
-		/*
-		 * bdi isn't aware of blkcg yet.  As all async IOs end up
-		 * root blkcg anyway, just use root blkcg state.
-		 */
-		if (rl == &q->root_rl)
-			blk_set_queue_congested(q, is_sync);
+		blk_set_congested(rl, is_sync);
 	}
 
 	/*
@@ -1525,7 +1541,8 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
  * Caller must ensure !blk_queue_nomerges(q) beforehand.
  */
 bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
-			    unsigned int *request_count)
+			    unsigned int *request_count,
+			    struct request **same_queue_rq)
 {
 	struct blk_plug *plug;
 	struct request *rq;
@@ -1545,8 +1562,16 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
 	list_for_each_entry_reverse(rq, plug_list, queuelist) {
 		int el_ret;
 
-		if (rq->q == q)
+		if (rq->q == q) {
 			(*request_count)++;
+			/*
+			 * Only blk-mq multiple hardware queues case checks the
+			 * rq in the same queue, there should be only one such
+			 * rq in a queue
+			 **/
+			if (same_queue_rq)
+				*same_queue_rq = rq;
+		}
 
 		if (rq->q != q || !blk_rq_merge_ok(rq, bio))
 			continue;
@@ -1611,7 +1636,7 @@ static void blk_queue_bio(struct request_queue *q, struct bio *bio)
 	 * any locks.
 	 */
 	if (!blk_queue_nomerges(q) &&
-	    blk_attempt_plug_merge(q, bio, &request_count))
+	    blk_attempt_plug_merge(q, bio, &request_count, NULL))
 		return;
 
 	spin_lock_irq(q->queue_lock);
@@ -1718,8 +1743,6 @@ static void handle_bad_sector(struct bio *bio)
 			bio->bi_rw,
 			(unsigned long long)bio_end_sector(bio),
 			(long long)(i_size_read(bio->bi_bdev->bd_inode) >> 9));
-
-	set_bit(BIO_EOF, &bio->bi_flags);
 }
 
 #ifdef CONFIG_FAIL_MAKE_REQUEST
@@ -2904,95 +2927,22 @@ int blk_lld_busy(struct request_queue *q)
 }
 EXPORT_SYMBOL_GPL(blk_lld_busy);
 
-/**
- * blk_rq_unprep_clone - Helper function to free all bios in a cloned request
- * @rq: the clone request to be cleaned up
- *
- * Description:
- *     Free all bios in @rq for a cloned request.
- */
-void blk_rq_unprep_clone(struct request *rq)
-{
-	struct bio *bio;
-
-	while ((bio = rq->bio) != NULL) {
-		rq->bio = bio->bi_next;
-
-		bio_put(bio);
-	}
-}
-EXPORT_SYMBOL_GPL(blk_rq_unprep_clone);
-
-/*
- * Copy attributes of the original request to the clone request.
- * The actual data parts (e.g. ->cmd, ->sense) are not copied.
- */
-static void __blk_rq_prep_clone(struct request *dst, struct request *src)
+void blk_rq_prep_clone(struct request *dst, struct request *src)
 {
 	dst->cpu = src->cpu;
-	dst->cmd_flags |= (src->cmd_flags & REQ_CLONE_MASK) | REQ_NOMERGE;
+	dst->cmd_flags |= (src->cmd_flags & REQ_CLONE_MASK);
+	dst->cmd_flags |= REQ_NOMERGE | REQ_CLONE;
 	dst->cmd_type = src->cmd_type;
 	dst->__sector = blk_rq_pos(src);
 	dst->__data_len = blk_rq_bytes(src);
 	dst->nr_phys_segments = src->nr_phys_segments;
 	dst->ioprio = src->ioprio;
 	dst->extra_len = src->extra_len;
-}
-
-/**
- * blk_rq_prep_clone - Helper function to setup clone request
- * @rq: the request to be setup
- * @rq_src: original request to be cloned
- * @bs: bio_set that bios for clone are allocated from
- * @gfp_mask: memory allocation mask for bio
- * @bio_ctr: setup function to be called for each clone bio.
- *           Returns %0 for success, non %0 for failure.
- * @data: private data to be passed to @bio_ctr
- *
- * Description:
- *     Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq.
- *     The actual data parts of @rq_src (e.g. ->cmd, ->sense)
- *     are not copied, and copying such parts is the caller's responsibility.
- *     Also, pages which the original bios are pointing to are not copied
- *     and the cloned bios just point same pages.
- *     So cloned bios must be completed before original bios, which means
- *     the caller must complete @rq before @rq_src.
- */
-int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
-		      struct bio_set *bs, gfp_t gfp_mask,
-		      int (*bio_ctr)(struct bio *, struct bio *, void *),
-		      void *data)
-{
-	struct bio *bio, *bio_src;
-
-	if (!bs)
-		bs = fs_bio_set;
-
-	__rq_for_each_bio(bio_src, rq_src) {
-		bio = bio_clone_fast(bio_src, gfp_mask, bs);
-		if (!bio)
-			goto free_and_out;
-
-		if (bio_ctr && bio_ctr(bio, bio_src, data))
-			goto free_and_out;
-
-		if (rq->bio) {
-			rq->biotail->bi_next = bio;
-			rq->biotail = bio;
-		} else
-			rq->bio = rq->biotail = bio;
-	}
-
-	__blk_rq_prep_clone(rq, rq_src);
-
-	return 0;
-
-free_and_out:
-	if (bio)
-		bio_put(bio);
-	blk_rq_unprep_clone(rq);
-
-	return -ENOMEM;
+	dst->bio = src->bio;
+	dst->biotail = src->biotail;
+	dst->cmd = src->cmd;
+	dst->cmd_len = src->cmd_len;
+	dst->sense = src->sense;
 }
 EXPORT_SYMBOL_GPL(blk_rq_prep_clone);
 
@@ -3034,21 +2984,20 @@ void blk_start_plug(struct blk_plug *plug)
 {
 	struct task_struct *tsk = current;
 
+	/*
+	 * If this is a nested plug, don't actually assign it.
+	 */
+	if (tsk->plug)
+		return;
+
 	INIT_LIST_HEAD(&plug->list);
 	INIT_LIST_HEAD(&plug->mq_list);
 	INIT_LIST_HEAD(&plug->cb_list);
-
 	/*
-	 * If this is a nested plug, don't actually assign it. It will be
-	 * flushed on its own.
+	 * Store ordering should not be needed here, since a potential
+	 * preempt will imply a full memory barrier
 	 */
-	if (!tsk->plug) {
-		/*
-		 * Store ordering should not be needed here, since a potential
-		 * preempt will imply a full memory barrier
-		 */
-		tsk->plug = plug;
-	}
+	tsk->plug = plug;
 }
 EXPORT_SYMBOL(blk_start_plug);
 
@@ -3195,10 +3144,11 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 
 void blk_finish_plug(struct blk_plug *plug)
 {
+	if (plug != current->plug)
+		return;
 	blk_flush_plug_list(plug, false);
 
-	if (plug == current->plug)
-		current->plug = NULL;
+	current->plug = NULL;
 }
 EXPORT_SYMBOL(blk_finish_plug);
 
diff --git a/block/blk-exec.c b/block/blk-exec.c
index 9924725fa50dca..3fec8a29d0fae1 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -53,7 +53,6 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
 			   rq_end_io_fn *done)
 {
 	int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
-	bool is_pm_resume;
 
 	WARN_ON(irqs_disabled());
 	WARN_ON(rq->cmd_type == REQ_TYPE_FS);
@@ -70,12 +69,6 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
 		return;
 	}
 
-	/*
-	 * need to check this before __blk_run_queue(), because rq can
-	 * be freed before that returns.
-	 */
-	is_pm_resume = rq->cmd_type == REQ_TYPE_PM_RESUME;
-
 	spin_lock_irq(q->queue_lock);
 
 	if (unlikely(blk_queue_dying(q))) {
@@ -88,9 +81,6 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
 
 	__elv_add_request(q, rq, where);
 	__blk_run_queue(q);
-	/* the queue is stopped so it won't be run */
-	if (is_pm_resume)
-		__blk_run_queue_uncond(q);
 	spin_unlock_irq(q->queue_lock);
 }
 EXPORT_SYMBOL_GPL(blk_execute_rq_nowait);
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index 79ffb4855af048..f548b64be09242 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -21,6 +21,7 @@
  */
 
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/mempool.h>
 #include <linux/bio.h>
 #include <linux/scatterlist.h>
diff --git a/block/blk-merge.c b/block/blk-merge.c
index fd3fee81c23ce2..30a0d9f890170b 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -589,7 +589,8 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
 	    !blk_write_same_mergeable(rq->bio, bio))
 		return false;
 
-	if (q->queue_flags & (1 << QUEUE_FLAG_SG_GAPS)) {
+	/* Only check gaps if the bio carries data */
+	if (q->queue_flags & (1 << QUEUE_FLAG_SG_GAPS) && bio_has_data(bio)) {
 		struct bio_vec *bprev;
 
 		bprev = &rq->biotail->bi_io_vec[rq->biotail->bi_vcnt - 1];
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index be3290cc0644ef..9b6e28830b8238 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -438,6 +438,39 @@ static void bt_for_each(struct blk_mq_hw_ctx *hctx,
 	}
 }
 
+static void bt_tags_for_each(struct blk_mq_tags *tags,
+		struct blk_mq_bitmap_tags *bt, unsigned int off,
+		busy_tag_iter_fn *fn, void *data, bool reserved)
+{
+	struct request *rq;
+	int bit, i;
+
+	if (!tags->rqs)
+		return;
+	for (i = 0; i < bt->map_nr; i++) {
+		struct blk_align_bitmap *bm = &bt->map[i];
+
+		for (bit = find_first_bit(&bm->word, bm->depth);
+		     bit < bm->depth;
+		     bit = find_next_bit(&bm->word, bm->depth, bit + 1)) {
+			rq = blk_mq_tag_to_rq(tags, off + bit);
+			fn(rq, data, reserved);
+		}
+
+		off += (1 << bt->bits_per_word);
+	}
+}
+
+void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
+		void *priv)
+{
+	if (tags->nr_reserved_tags)
+		bt_tags_for_each(tags, &tags->breserved_tags, 0, fn, priv, true);
+	bt_tags_for_each(tags, &tags->bitmap_tags, tags->nr_reserved_tags, fn, priv,
+			false);
+}
+EXPORT_SYMBOL(blk_mq_all_tag_busy_iter);
+
 void blk_mq_tag_busy_iter(struct blk_mq_hw_ctx *hctx, busy_iter_fn *fn,
 		void *priv)
 {
@@ -580,6 +613,11 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
 	if (!tags)
 		return NULL;
 
+	if (!zalloc_cpumask_var(&tags->cpumask, GFP_KERNEL)) {
+		kfree(tags);
+		return NULL;
+	}
+
 	tags->nr_tags = total_tags;
 	tags->nr_reserved_tags = reserved_tags;
 
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 90767b370308da..75893a34237d2e 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -44,6 +44,7 @@ struct blk_mq_tags {
 	struct list_head page_list;
 
 	int alloc_policy;
+	cpumask_var_t cpumask;
 };
 
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index e68b71b85a7eaf..cd619fe8bc26c8 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -89,7 +89,8 @@ static int blk_mq_queue_enter(struct request_queue *q, gfp_t gfp)
 			return -EBUSY;
 
 		ret = wait_event_interruptible(q->mq_freeze_wq,
-				!q->mq_freeze_depth || blk_queue_dying(q));
+				!atomic_read(&q->mq_freeze_depth) ||
+				blk_queue_dying(q));
 		if (blk_queue_dying(q))
 			return -ENODEV;
 		if (ret)
@@ -112,13 +113,10 @@ static void blk_mq_usage_counter_release(struct percpu_ref *ref)
 
 void blk_mq_freeze_queue_start(struct request_queue *q)
 {
-	bool freeze;
+	int freeze_depth;
 
-	spin_lock_irq(q->queue_lock);
-	freeze = !q->mq_freeze_depth++;
-	spin_unlock_irq(q->queue_lock);
-
-	if (freeze) {
+	freeze_depth = atomic_inc_return(&q->mq_freeze_depth);
+	if (freeze_depth == 1) {
 		percpu_ref_kill(&q->mq_usage_counter);
 		blk_mq_run_hw_queues(q, false);
 	}
@@ -143,13 +141,11 @@ EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);
 
 void blk_mq_unfreeze_queue(struct request_queue *q)
 {
-	bool wake;
+	int freeze_depth;
 
-	spin_lock_irq(q->queue_lock);
-	wake = !--q->mq_freeze_depth;
-	WARN_ON_ONCE(q->mq_freeze_depth < 0);
-	spin_unlock_irq(q->queue_lock);
-	if (wake) {
+	freeze_depth = atomic_dec_return(&q->mq_freeze_depth);
+	WARN_ON_ONCE(freeze_depth < 0);
+	if (!freeze_depth) {
 		percpu_ref_reinit(&q->mq_usage_counter);
 		wake_up_all(&q->mq_freeze_wq);
 	}
@@ -1237,6 +1233,38 @@ static struct request *blk_mq_map_request(struct request_queue *q,
 	return rq;
 }
 
+static int blk_mq_direct_issue_request(struct request *rq)
+{
+	int ret;
+	struct request_queue *q = rq->q;
+	struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q,
+			rq->mq_ctx->cpu);
+	struct blk_mq_queue_data bd = {
+		.rq = rq,
+		.list = NULL,
+		.last = 1
+	};
+
+	/*
+	 * For OK queue, we are done. For error, kill it. Any other
+	 * error (busy), just add it to our list as we previously
+	 * would have done
+	 */
+	ret = q->mq_ops->queue_rq(hctx, &bd);
+	if (ret == BLK_MQ_RQ_QUEUE_OK)
+		return 0;
+	else {
+		__blk_mq_requeue_request(rq);
+
+		if (ret == BLK_MQ_RQ_QUEUE_ERROR) {
+			rq->errors = -EIO;
+			blk_mq_end_request(rq, rq->errors);
+			return 0;
+		}
+		return -1;
+	}
+}
+
 /*
  * Multiple hardware queue variant. This will not use per-process plugs,
  * but will attempt to bypass the hctx queueing if we can go straight to
@@ -1248,6 +1276,9 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
 	const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
 	struct blk_map_ctx data;
 	struct request *rq;
+	unsigned int request_count = 0;
+	struct blk_plug *plug;
+	struct request *same_queue_rq = NULL;
 
 	blk_queue_bounce(q, &bio);
 
@@ -1256,6 +1287,10 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
 		return;
 	}
 
+	if (!is_flush_fua && !blk_queue_nomerges(q) &&
+	    blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))
+		return;
+
 	rq = blk_mq_map_request(q, bio, &data);
 	if (unlikely(!rq))
 		return;
@@ -1266,38 +1301,42 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
 		goto run_queue;
 	}
 
+	plug = current->plug;
 	/*
 	 * If the driver supports defer issued based on 'last', then
 	 * queue it up like normal since we can potentially save some
 	 * CPU this way.
 	 */
-	if (is_sync && !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) {
-		struct blk_mq_queue_data bd = {
-			.rq = rq,
-			.list = NULL,
-			.last = 1
-		};
-		int ret;
+	if (((plug && !blk_queue_nomerges(q)) || is_sync) &&
+	    !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) {
+		struct request *old_rq = NULL;
 
 		blk_mq_bio_to_request(rq, bio);
 
 		/*
-		 * For OK queue, we are done. For error, kill it. Any other
-		 * error (busy), just add it to our list as we previously
-		 * would have done
+		 * we do limited pluging. If bio can be merged, do merge.
+		 * Otherwise the existing request in the plug list will be
+		 * issued. So the plug list will have one request at most
 		 */
-		ret = q->mq_ops->queue_rq(data.hctx, &bd);
-		if (ret == BLK_MQ_RQ_QUEUE_OK)
-			goto done;
-		else {
-			__blk_mq_requeue_request(rq);
-
-			if (ret == BLK_MQ_RQ_QUEUE_ERROR) {
-				rq->errors = -EIO;
-				blk_mq_end_request(rq, rq->errors);
-				goto done;
+		if (plug) {
+			/*
+			 * The plug list might get flushed before this. If that
+			 * happens, same_queue_rq is invalid and plug list is empty
+			 **/
+			if (same_queue_rq && !list_empty(&plug->mq_list)) {
+				old_rq = same_queue_rq;
+				list_del_init(&old_rq->queuelist);
 			}
-		}
+			list_add_tail(&rq->queuelist, &plug->mq_list);
+		} else /* is_sync */
+			old_rq = rq;
+		blk_mq_put_ctx(data.ctx);
+		if (!old_rq)
+			return;
+		if (!blk_mq_direct_issue_request(old_rq))
+			return;
+		blk_mq_insert_request(old_rq, false, true, true);
+		return;
 	}
 
 	if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
@@ -1310,7 +1349,6 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
 run_queue:
 		blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua);
 	}
-done:
 	blk_mq_put_ctx(data.ctx);
 }
 
@@ -1322,16 +1360,11 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio)
 {
 	const int is_sync = rw_is_sync(bio->bi_rw);
 	const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
-	unsigned int use_plug, request_count = 0;
+	struct blk_plug *plug;
+	unsigned int request_count = 0;
 	struct blk_map_ctx data;
 	struct request *rq;
 
-	/*
-	 * If we have multiple hardware queues, just go directly to
-	 * one of those for sync IO.
-	 */
-	use_plug = !is_flush_fua && !is_sync;
-
 	blk_queue_bounce(q, &bio);
 
 	if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
@@ -1339,8 +1372,8 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio)
 		return;
 	}
 
-	if (use_plug && !blk_queue_nomerges(q) &&
-	    blk_attempt_plug_merge(q, bio, &request_count))
+	if (!is_flush_fua && !blk_queue_nomerges(q) &&
+	    blk_attempt_plug_merge(q, bio, &request_count, NULL))
 		return;
 
 	rq = blk_mq_map_request(q, bio, &data);
@@ -1358,21 +1391,18 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio)
 	 * utilize that to temporarily store requests until the task is
 	 * either done or scheduled away.
 	 */
-	if (use_plug) {
-		struct blk_plug *plug = current->plug;
-
-		if (plug) {
-			blk_mq_bio_to_request(rq, bio);
-			if (list_empty(&plug->mq_list))
-				trace_block_plug(q);
-			else if (request_count >= BLK_MAX_REQUEST_COUNT) {
-				blk_flush_plug_list(plug, false);
-				trace_block_plug(q);
-			}
-			list_add_tail(&rq->queuelist, &plug->mq_list);
-			blk_mq_put_ctx(data.ctx);
-			return;
+	plug = current->plug;
+	if (plug) {
+		blk_mq_bio_to_request(rq, bio);
+		if (list_empty(&plug->mq_list))
+			trace_block_plug(q);
+		else if (request_count >= BLK_MAX_REQUEST_COUNT) {
+			blk_flush_plug_list(plug, false);
+			trace_block_plug(q);
 		}
+		list_add_tail(&rq->queuelist, &plug->mq_list);
+		blk_mq_put_ctx(data.ctx);
+		return;
 	}
 
 	if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
@@ -1508,7 +1538,6 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
 			i++;
 		}
 	}
-
 	return tags;
 
 fail:
@@ -1792,6 +1821,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 
 		hctx = q->mq_ops->map_queue(q, i);
 		cpumask_set_cpu(i, hctx->cpumask);
+		cpumask_set_cpu(i, hctx->tags->cpumask);
 		ctx->index_hw = hctx->nr_ctx;
 		hctx->ctxs[hctx->nr_ctx++] = ctx;
 	}
@@ -2052,7 +2082,7 @@ void blk_mq_free_queue(struct request_queue *q)
 /* Basically redo blk_mq_init_queue with queue frozen */
 static void blk_mq_queue_reinit(struct request_queue *q)
 {
-	WARN_ON_ONCE(!q->mq_freeze_depth);
+	WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth));
 
 	blk_mq_sysfs_unregister(q);
 
@@ -2169,6 +2199,12 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
 	return 0;
 }
 
+struct cpumask *blk_mq_tags_cpumask(struct blk_mq_tags *tags)
+{
+	return tags->cpumask;
+}
+EXPORT_SYMBOL_GPL(blk_mq_tags_cpumask);
+
 /*
  * Alloc a tag set to be associated with one or more request queues.
  * May fail with EINVAL for various error conditions. May adjust the
@@ -2230,8 +2266,10 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
 	int i;
 
 	for (i = 0; i < set->nr_hw_queues; i++) {
-		if (set->tags[i])
+		if (set->tags[i]) {
 			blk_mq_free_rq_map(set, set->tags[i], i);
+			free_cpumask_var(set->tags[i]->cpumask);
+		}
 	}
 
 	kfree(set->tags);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 2b8fd302f677a9..6264b382d4d1ba 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -6,11 +6,12 @@
 #include <linux/module.h>
 #include <linux/bio.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/blktrace_api.h>
 #include <linux/blk-mq.h>
+#include <linux/blk-cgroup.h>
 
 #include "blk.h"
-#include "blk-cgroup.h"
 #include "blk-mq.h"
 
 struct queue_sysfs_entry {
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 5b9c6d5c3636ad..b23193518ac7a9 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -9,7 +9,7 @@
 #include <linux/blkdev.h>
 #include <linux/bio.h>
 #include <linux/blktrace_api.h>
-#include "blk-cgroup.h"
+#include <linux/blk-cgroup.h>
 #include "blk.h"
 
 /* Max dispatch from a group in 1 round */
diff --git a/block/blk.h b/block/blk.h
index 43b036185712c2..026d9594142bde 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -78,7 +78,8 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
 bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
 			    struct bio *bio);
 bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
-			    unsigned int *request_count);
+			    unsigned int *request_count,
+			    struct request **same_queue_rq);
 
 void blk_account_io_start(struct request *req, bool new_io);
 void blk_account_io_completion(struct request *req, unsigned int bytes);
@@ -193,8 +194,6 @@ int blk_try_merge(struct request *rq, struct bio *bio);
 
 void blk_queue_congestion_threshold(struct request_queue *q);
 
-void __blk_run_queue_uncond(struct request_queue *q);
-
 int blk_dev_init(void);
 
 
diff --git a/block/bounce.c b/block/bounce.c
index ed9dd80671204b..b17311227c1276 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -13,6 +13,7 @@
 #include <linux/pagemap.h>
 #include <linux/mempool.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/init.h>
 #include <linux/hash.h>
 #include <linux/highmem.h>
@@ -128,9 +129,6 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool, int err)
 	struct bio_vec *bvec, *org_vec;
 	int i;
 
-	if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags))
-		set_bit(BIO_EOPNOTSUPP, &bio_orig->bi_flags);
-
 	/*
 	 * free up bounce indirect pages used
 	 */
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 5da8e6e9ab4bfd..bc8f4293077369 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -14,8 +14,8 @@
 #include <linux/rbtree.h>
 #include <linux/ioprio.h>
 #include <linux/blktrace_api.h>
+#include <linux/blk-cgroup.h>
 #include "blk.h"
-#include "blk-cgroup.h"
 
 /*
  * tunables
diff --git a/block/elevator.c b/block/elevator.c
index 8985038f398ce5..e321bd55ddfeef 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -35,11 +35,11 @@
 #include <linux/hash.h>
 #include <linux/uaccess.h>
 #include <linux/pm_runtime.h>
+#include <linux/blk-cgroup.h>
 
 #include <trace/events/block.h>
 
 #include "blk.h"
-#include "blk-cgroup.h"
 
 static DEFINE_SPINLOCK(elv_list_lock);
 static LIST_HEAD(elv_list);
diff --git a/block/genhd.c b/block/genhd.c
index 666e11b8398378..edc1dbb6c1c96f 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -8,6 +8,7 @@
 #include <linux/kdev_t.h>
 #include <linux/kernel.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/init.h>
 #include <linux/spinlock.h>
 #include <linux/proc_fs.h>
diff --git a/block/ioctl.c b/block/ioctl.c
index 7d8befde2aca7a..8061eba42887a9 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -150,21 +150,48 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
 	}
 }
 
-static int blkdev_reread_part(struct block_device *bdev)
+/*
+ * This is an exported API for the block driver, and will not
+ * acquire bd_mutex. This API should be used in case that
+ * caller has held bd_mutex already.
+ */
+int __blkdev_reread_part(struct block_device *bdev)
 {
 	struct gendisk *disk = bdev->bd_disk;
-	int res;
 
 	if (!disk_part_scan_enabled(disk) || bdev != bdev->bd_contains)
 		return -EINVAL;
 	if (!capable(CAP_SYS_ADMIN))
 		return -EACCES;
-	if (!mutex_trylock(&bdev->bd_mutex))
-		return -EBUSY;
-	res = rescan_partitions(disk, bdev);
+
+	lockdep_assert_held(&bdev->bd_mutex);
+
+	return rescan_partitions(disk, bdev);
+}
+EXPORT_SYMBOL(__blkdev_reread_part);
+
+/*
+ * This is an exported API for the block driver, and will
+ * try to acquire bd_mutex. If bd_mutex has been held already
+ * in current context, please call __blkdev_reread_part().
+ *
+ * Make sure the held locks in current context aren't required
+ * in open()/close() handler and I/O path for avoiding ABBA deadlock:
+ * - bd_mutex is held before calling block driver's open/close
+ *   handler
+ * - reading partition table may submit I/O to the block device
+ */
+int blkdev_reread_part(struct block_device *bdev)
+{
+	int res;
+
+	mutex_lock(&bdev->bd_mutex);
+	res = __blkdev_reread_part(bdev);
 	mutex_unlock(&bdev->bd_mutex);
+
 	return res;
 }
+EXPORT_SYMBOL(blkdev_reread_part);
 
 static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
 			     uint64_t len, int secure)
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index ff20f192b0f67a..0422c47261c3a0 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -139,8 +139,6 @@ static struct board_type products[] = {
 	{0x3214103C, "Smart Array E200i", &SA5_access},
 	{0x3215103C, "Smart Array E200i", &SA5_access},
 	{0x3237103C, "Smart Array E500", &SA5_access},
-	{0x3223103C, "Smart Array P800", &SA5_access},
-	{0x3234103C, "Smart Array P400", &SA5_access},
 	{0x323D103C, "Smart Array P700m", &SA5_access},
 };
 
@@ -574,8 +572,6 @@ static void cciss_procinit(ctlr_info_t *h)
 
 /* List of controllers which cannot be hard reset on kexec with reset_devices */
 static u32 unresettable_controller[] = {
-	0x324a103C, /* Smart Array P712m */
-	0x324b103C, /* SmartArray P711m */
 	0x3223103C, /* Smart Array P800 */
 	0x3234103C, /* Smart Array P400 */
 	0x3235103C, /* Smart Array P400i */
@@ -586,12 +582,32 @@ static u32 unresettable_controller[] = {
 	0x3215103C, /* Smart Array E200i */
 	0x3237103C, /* Smart Array E500 */
 	0x323D103C, /* Smart Array P700m */
+	0x40800E11, /* Smart Array 5i */
 	0x409C0E11, /* Smart Array 6400 */
 	0x409D0E11, /* Smart Array 6400 EM */
+	0x40700E11, /* Smart Array 5300 */
+	0x40820E11, /* Smart Array 532 */
+	0x40830E11, /* Smart Array 5312 */
+	0x409A0E11, /* Smart Array 641 */
+	0x409B0E11, /* Smart Array 642 */
+	0x40910E11, /* Smart Array 6i */
 };
 
 /* List of controllers which cannot even be soft reset */
 static u32 soft_unresettable_controller[] = {
+	0x40800E11, /* Smart Array 5i */
+	0x40700E11, /* Smart Array 5300 */
+	0x40820E11, /* Smart Array 532 */
+	0x40830E11, /* Smart Array 5312 */
+	0x409A0E11, /* Smart Array 641 */
+	0x409B0E11, /* Smart Array 642 */
+	0x40910E11, /* Smart Array 6i */
+	/* Exclude 640x boards.  These are two pci devices in one slot
+	 * which share a battery backed cache module.  One controls the
+	 * cache, the other accesses the cache through the one that controls
+	 * it.  If we reset the one controlling the cache, the other will
+	 * likely not be happy.  Just forbid resetting this conjoined mess.
+	 */
 	0x409C0E11, /* Smart Array 6400 */
 	0x409D0E11, /* Smart Array 6400 EM */
 };
@@ -4667,8 +4683,7 @@ static int cciss_kdump_hard_reset_controller(struct pci_dev *pdev)
 	 */
 	cciss_lookup_board_id(pdev, &board_id);
 	if (!ctlr_is_resettable(board_id)) {
-		dev_warn(&pdev->dev, "Cannot reset Smart Array 640x "
-				"due to shared cache module.");
+		dev_warn(&pdev->dev, "Controller not resettable\n");
 		return -ENODEV;
 	}
 
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index b905e9888b888e..efd19c2da9c29b 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -38,6 +38,7 @@
 #include <linux/mutex.h>
 #include <linux/major.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/genhd.h>
 #include <linux/idr.h>
 #include <net/tcp.h>
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 81fde9ef7f8e0e..a1518539b85804 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2359,7 +2359,7 @@ static void drbd_cleanup(void)
  * @congested_data:	User data
  * @bdi_bits:		Bits the BDI flusher thread is currently interested in
  *
- * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
+ * Returns 1<<WB_async_congested and/or 1<<WB_sync_congested if we are congested.
  */
 static int drbd_congested(void *congested_data, int bdi_bits)
 {
@@ -2376,14 +2376,14 @@ static int drbd_congested(void *congested_data, int bdi_bits)
 	}
 
 	if (test_bit(CALLBACK_PENDING, &first_peer_device(device)->connection->flags)) {
-		r |= (1 << BDI_async_congested);
+		r |= (1 << WB_async_congested);
 		/* Without good local data, we would need to read from remote,
 		 * and that would need the worker thread as well, which is
 		 * currently blocked waiting for that usermode helper to
 		 * finish.
 		 */
 		if (!get_ldev_if_state(device, D_UP_TO_DATE))
-			r |= (1 << BDI_sync_congested);
+			r |= (1 << WB_sync_congested);
 		else
 			put_ldev(device);
 		r &= bdi_bits;
@@ -2399,9 +2399,9 @@ static int drbd_congested(void *congested_data, int bdi_bits)
 			reason = 'b';
 	}
 
-	if (bdi_bits & (1 << BDI_async_congested) &&
+	if (bdi_bits & (1 << WB_async_congested) &&
 	    test_bit(NET_CONGESTED, &first_peer_device(device)->connection->flags)) {
-		r |= (1 << BDI_async_congested);
+		r |= (1 << WB_async_congested);
 		reason = reason == 'b' ? 'a' : 'n';
 	}
 
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index d7173cb1ea76c2..40580dc7f41cac 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -86,8 +86,6 @@ static DEFINE_MUTEX(loop_index_mutex);
 static int max_part;
 static int part_shift;
 
-static struct workqueue_struct *loop_wq;
-
 static int transfer_xor(struct loop_device *lo, int cmd,
 			struct page *raw_page, unsigned raw_off,
 			struct page *loop_page, unsigned loop_off,
@@ -476,6 +474,28 @@ static int loop_flush(struct loop_device *lo)
 	return loop_switch(lo, NULL);
 }
 
+static void loop_reread_partitions(struct loop_device *lo,
+				   struct block_device *bdev)
+{
+	int rc;
+
+	/*
+	 * bd_mutex has been held already in release path, so don't
+	 * acquire it if this function is called in such case.
+	 *
+	 * If the reread partition isn't from release path, lo_refcnt
+	 * must be at least one and it can only become zero when the
+	 * current holder is released.
+	 */
+	if (!atomic_read(&lo->lo_refcnt))
+		rc = __blkdev_reread_part(bdev);
+	else
+		rc = blkdev_reread_part(bdev);
+	if (rc)
+		pr_warn("%s: partition scan of loop%d (%s) failed (rc=%d)\n",
+			__func__, lo->lo_number, lo->lo_file_name, rc);
+}
+
 /*
  * loop_change_fd switched the backing store of a loopback device to
  * a new file. This is useful for operating system installers to free up
@@ -524,7 +544,7 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev,
 
 	fput(old_file);
 	if (lo->lo_flags & LO_FLAGS_PARTSCAN)
-		ioctl_by_bdev(bdev, BLKRRPART, 0);
+		loop_reread_partitions(lo, bdev);
 	return 0;
 
  out_putf:
@@ -725,6 +745,12 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
 	size = get_loop_size(lo, file);
 	if ((loff_t)(sector_t)size != size)
 		goto out_putf;
+	error = -ENOMEM;
+	lo->wq = alloc_workqueue("kloopd%d",
+			WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_UNBOUND, 16,
+			lo->lo_number);
+	if (!lo->wq)
+		goto out_putf;
 
 	error = 0;
 
@@ -755,7 +781,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
 	if (part_shift)
 		lo->lo_flags |= LO_FLAGS_PARTSCAN;
 	if (lo->lo_flags & LO_FLAGS_PARTSCAN)
-		ioctl_by_bdev(bdev, BLKRRPART, 0);
+		loop_reread_partitions(lo, bdev);
 
 	/* Grab the block_device to prevent its destruction after we
 	 * put /dev/loopXX inode. Later in loop_clr_fd() we bdput(bdev).
@@ -827,7 +853,7 @@ static int loop_clr_fd(struct loop_device *lo)
 	 * <dev>/do something like mkfs/losetup -d <dev> causing the losetup -d
 	 * command to fail with EBUSY.
 	 */
-	if (lo->lo_refcnt > 1) {
+	if (atomic_read(&lo->lo_refcnt) > 1) {
 		lo->lo_flags |= LO_FLAGS_AUTOCLEAR;
 		mutex_unlock(&lo->lo_ctl_mutex);
 		return 0;
@@ -836,6 +862,9 @@ static int loop_clr_fd(struct loop_device *lo)
 	if (filp == NULL)
 		return -EINVAL;
 
+	/* freeze request queue during the transition */
+	blk_mq_freeze_queue(lo->lo_queue);
+
 	spin_lock_irq(&lo->lo_lock);
 	lo->lo_state = Lo_rundown;
 	lo->lo_backing_file = NULL;
@@ -867,11 +896,15 @@ static int loop_clr_fd(struct loop_device *lo)
 	lo->lo_state = Lo_unbound;
 	/* This is safe: open() is still holding a reference. */
 	module_put(THIS_MODULE);
+	blk_mq_unfreeze_queue(lo->lo_queue);
+
 	if (lo->lo_flags & LO_FLAGS_PARTSCAN && bdev)
-		ioctl_by_bdev(bdev, BLKRRPART, 0);
+		loop_reread_partitions(lo, bdev);
 	lo->lo_flags = 0;
 	if (!part_shift)
 		lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN;
+	destroy_workqueue(lo->wq);
+	lo->wq = NULL;
 	mutex_unlock(&lo->lo_ctl_mutex);
 	/*
 	 * Need not hold lo_ctl_mutex to fput backing file.
@@ -943,7 +976,7 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
 	     !(lo->lo_flags & LO_FLAGS_PARTSCAN)) {
 		lo->lo_flags |= LO_FLAGS_PARTSCAN;
 		lo->lo_disk->flags &= ~GENHD_FL_NO_PART_SCAN;
-		ioctl_by_bdev(lo->lo_device, BLKRRPART, 0);
+		loop_reread_partitions(lo, lo->lo_device);
 	}
 
 	lo->lo_encrypt_key_size = info->lo_encrypt_key_size;
@@ -1324,9 +1357,7 @@ static int lo_open(struct block_device *bdev, fmode_t mode)
 		goto out;
 	}
 
-	mutex_lock(&lo->lo_ctl_mutex);
-	lo->lo_refcnt++;
-	mutex_unlock(&lo->lo_ctl_mutex);
+	atomic_inc(&lo->lo_refcnt);
 out:
 	mutex_unlock(&loop_index_mutex);
 	return err;
@@ -1337,11 +1368,10 @@ static void lo_release(struct gendisk *disk, fmode_t mode)
 	struct loop_device *lo = disk->private_data;
 	int err;
 
-	mutex_lock(&lo->lo_ctl_mutex);
-
-	if (--lo->lo_refcnt)
-		goto out;
+	if (atomic_dec_return(&lo->lo_refcnt))
+		return;
 
+	mutex_lock(&lo->lo_ctl_mutex);
 	if (lo->lo_flags & LO_FLAGS_AUTOCLEAR) {
 		/*
 		 * In autoclear mode, stop the loop thread
@@ -1358,7 +1388,6 @@ static void lo_release(struct gendisk *disk, fmode_t mode)
 		loop_flush(lo);
 	}
 
-out:
 	mutex_unlock(&lo->lo_ctl_mutex);
 }
 
@@ -1425,9 +1454,13 @@ static int loop_queue_rq(struct blk_mq_hw_ctx *hctx,
 		const struct blk_mq_queue_data *bd)
 {
 	struct loop_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
+	struct loop_device *lo = cmd->rq->q->queuedata;
 
 	blk_mq_start_request(bd->rq);
 
+	if (lo->lo_state != Lo_bound)
+		return -EIO;
+
 	if (cmd->rq->cmd_flags & REQ_WRITE) {
 		struct loop_device *lo = cmd->rq->q->queuedata;
 		bool need_sched = true;
@@ -1441,9 +1474,9 @@ static int loop_queue_rq(struct blk_mq_hw_ctx *hctx,
 		spin_unlock_irq(&lo->lo_lock);
 
 		if (need_sched)
-			queue_work(loop_wq, &lo->write_work);
+			queue_work(lo->wq, &lo->write_work);
 	} else {
-		queue_work(loop_wq, &cmd->read_work);
+		queue_work(lo->wq, &cmd->read_work);
 	}
 
 	return BLK_MQ_RQ_QUEUE_OK;
@@ -1455,9 +1488,6 @@ static void loop_handle_cmd(struct loop_cmd *cmd)
 	struct loop_device *lo = cmd->rq->q->queuedata;
 	int ret = -EIO;
 
-	if (lo->lo_state != Lo_bound)
-		goto failed;
-
 	if (write && (lo->lo_flags & LO_FLAGS_READ_ONLY))
 		goto failed;
 
@@ -1594,6 +1624,7 @@ static int loop_add(struct loop_device **l, int i)
 		disk->flags |= GENHD_FL_NO_PART_SCAN;
 	disk->flags |= GENHD_FL_EXT_DEVT;
 	mutex_init(&lo->lo_ctl_mutex);
+	atomic_set(&lo->lo_refcnt, 0);
 	lo->lo_number		= i;
 	spin_lock_init(&lo->lo_lock);
 	disk->major		= LOOP_MAJOR;
@@ -1711,7 +1742,7 @@ static long loop_control_ioctl(struct file *file, unsigned int cmd,
 			mutex_unlock(&lo->lo_ctl_mutex);
 			break;
 		}
-		if (lo->lo_refcnt > 0) {
+		if (atomic_read(&lo->lo_refcnt) > 0) {
 			ret = -EBUSY;
 			mutex_unlock(&lo->lo_ctl_mutex);
 			break;
@@ -1806,13 +1837,6 @@ static int __init loop_init(void)
 		goto misc_out;
 	}
 
-	loop_wq = alloc_workqueue("kloopd",
-			WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_UNBOUND, 0);
-	if (!loop_wq) {
-		err = -ENOMEM;
-		goto misc_out;
-	}
-
 	blk_register_region(MKDEV(LOOP_MAJOR, 0), range,
 				  THIS_MODULE, loop_probe, NULL, NULL);
 
@@ -1850,8 +1874,6 @@ static void __exit loop_exit(void)
 	blk_unregister_region(MKDEV(LOOP_MAJOR, 0), range);
 	unregister_blkdev(LOOP_MAJOR, "loop");
 
-	destroy_workqueue(loop_wq);
-
 	misc_deregister(&loop_misc);
 }
 
diff --git a/drivers/block/loop.h b/drivers/block/loop.h
index 301c27f8323ffd..25e8997ed2467f 100644
--- a/drivers/block/loop.h
+++ b/drivers/block/loop.h
@@ -28,7 +28,7 @@ struct loop_func_table;
 
 struct loop_device {
 	int		lo_number;
-	int		lo_refcnt;
+	atomic_t	lo_refcnt;
 	loff_t		lo_offset;
 	loff_t		lo_sizelimit;
 	int		lo_flags;
@@ -54,6 +54,7 @@ struct loop_device {
 	gfp_t		old_gfp_mask;
 
 	spinlock_t		lo_lock;
+	struct workqueue_struct *wq;
 	struct list_head	write_cmd_head;
 	struct work_struct	write_work;
 	bool			write_started;
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 39e5f7fae3efb7..0e385d8e9b86ec 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -230,29 +230,40 @@ static int nbd_send_req(struct nbd_device *nbd, struct request *req)
 	int result, flags;
 	struct nbd_request request;
 	unsigned long size = blk_rq_bytes(req);
+	u32 type;
+
+	if (req->cmd_type == REQ_TYPE_DRV_PRIV)
+		type = NBD_CMD_DISC;
+	else if (req->cmd_flags & REQ_DISCARD)
+		type = NBD_CMD_TRIM;
+	else if (req->cmd_flags & REQ_FLUSH)
+		type = NBD_CMD_FLUSH;
+	else if (rq_data_dir(req) == WRITE)
+		type = NBD_CMD_WRITE;
+	else
+		type = NBD_CMD_READ;
 
 	memset(&request, 0, sizeof(request));
 	request.magic = htonl(NBD_REQUEST_MAGIC);
-	request.type = htonl(nbd_cmd(req));
-
-	if (nbd_cmd(req) != NBD_CMD_FLUSH && nbd_cmd(req) != NBD_CMD_DISC) {
+	request.type = htonl(type);
+	if (type != NBD_CMD_FLUSH && type != NBD_CMD_DISC) {
 		request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9);
 		request.len = htonl(size);
 	}
 	memcpy(request.handle, &req, sizeof(req));
 
 	dev_dbg(nbd_to_dev(nbd), "request %p: sending control (%s@%llu,%uB)\n",
-		req, nbdcmd_to_ascii(nbd_cmd(req)),
+		req, nbdcmd_to_ascii(type),
 		(unsigned long long)blk_rq_pos(req) << 9, blk_rq_bytes(req));
 	result = sock_xmit(nbd, 1, &request, sizeof(request),
-			(nbd_cmd(req) == NBD_CMD_WRITE) ? MSG_MORE : 0);
+			(type == NBD_CMD_WRITE) ? MSG_MORE : 0);
 	if (result <= 0) {
 		dev_err(disk_to_dev(nbd->disk),
 			"Send control failed (result %d)\n", result);
 		return -EIO;
 	}
 
-	if (nbd_cmd(req) == NBD_CMD_WRITE) {
+	if (type == NBD_CMD_WRITE) {
 		struct req_iterator iter;
 		struct bio_vec bvec;
 		/*
@@ -352,7 +363,7 @@ static struct request *nbd_read_stat(struct nbd_device *nbd)
 	}
 
 	dev_dbg(nbd_to_dev(nbd), "request %p: got reply\n", req);
-	if (nbd_cmd(req) == NBD_CMD_READ) {
+	if (rq_data_dir(req) != WRITE) {
 		struct req_iterator iter;
 		struct bio_vec bvec;
 
@@ -452,23 +463,11 @@ static void nbd_handle_req(struct nbd_device *nbd, struct request *req)
 	if (req->cmd_type != REQ_TYPE_FS)
 		goto error_out;
 
-	nbd_cmd(req) = NBD_CMD_READ;
-	if (rq_data_dir(req) == WRITE) {
-		if ((req->cmd_flags & REQ_DISCARD)) {
-			WARN_ON(!(nbd->flags & NBD_FLAG_SEND_TRIM));
-			nbd_cmd(req) = NBD_CMD_TRIM;
-		} else
-			nbd_cmd(req) = NBD_CMD_WRITE;
-		if (nbd->flags & NBD_FLAG_READ_ONLY) {
-			dev_err(disk_to_dev(nbd->disk),
-				"Write on read-only\n");
-			goto error_out;
-		}
-	}
-
-	if (req->cmd_flags & REQ_FLUSH) {
-		BUG_ON(unlikely(blk_rq_sectors(req)));
-		nbd_cmd(req) = NBD_CMD_FLUSH;
+	if (rq_data_dir(req) == WRITE &&
+	    (nbd->flags & NBD_FLAG_READ_ONLY)) {
+		dev_err(disk_to_dev(nbd->disk),
+			"Write on read-only\n");
+		goto error_out;
 	}
 
 	req->errors = 0;
@@ -592,8 +591,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
 		fsync_bdev(bdev);
 		mutex_lock(&nbd->tx_lock);
 		blk_rq_init(NULL, &sreq);
-		sreq.cmd_type = REQ_TYPE_SPECIAL;
-		nbd_cmd(&sreq) = NBD_CMD_DISC;
+		sreq.cmd_type = REQ_TYPE_DRV_PRIV;
 
 		/* Check again after getting mutex back.  */
 		if (!nbd->sock)
@@ -713,7 +711,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
 		bdev->bd_inode->i_size = 0;
 		set_capacity(nbd->disk, 0);
 		if (max_part > 0)
-			ioctl_by_bdev(bdev, BLKRRPART, 0);
+			blkdev_reread_part(bdev);
 		if (nbd->disconnect) /* user requested, ignore socket errors */
 			return 0;
 		return nbd->harderror;
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index 0b4b25617bb7a0..69de41a87b7431 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -243,6 +243,17 @@ static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer)
 			cmd = container_of(entry, struct nullb_cmd, ll_list);
 			entry = entry->next;
 			end_cmd(cmd);
+
+			if (cmd->rq) {
+				struct request_queue *q = cmd->rq->q;
+
+				if (!q->mq_ops && blk_queue_stopped(q)) {
+					spin_lock(q->queue_lock);
+					if (blk_queue_stopped(q))
+						blk_start_queue(q);
+					spin_unlock(q->queue_lock);
+				}
+			}
 		} while (entry);
 	}
 
@@ -257,7 +268,7 @@ static void null_cmd_end_timer(struct nullb_cmd *cmd)
 	if (llist_add(&cmd->ll_list, &cq->list)) {
 		ktime_t kt = ktime_set(0, completion_nsec);
 
-		hrtimer_start(&cq->timer, kt, HRTIMER_MODE_REL);
+		hrtimer_start(&cq->timer, kt, HRTIMER_MODE_REL_PINNED);
 	}
 
 	put_cpu();
@@ -334,6 +345,7 @@ static int null_rq_prep_fn(struct request_queue *q, struct request *req)
 		req->special = cmd;
 		return BLKPREP_OK;
 	}
+	blk_stop_queue(q);
 
 	return BLKPREP_DEFER;
 }
diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index 683dff272562b1..e19fa5c21c1486 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -102,6 +102,7 @@ struct nvme_queue {
 	spinlock_t q_lock;
 	struct nvme_command *sq_cmds;
 	volatile struct nvme_completion *cqes;
+	struct blk_mq_tags **tags;
 	dma_addr_t sq_dma_addr;
 	dma_addr_t cq_dma_addr;
 	u32 __iomem *q_db;
@@ -114,7 +115,6 @@ struct nvme_queue {
 	u8 cq_phase;
 	u8 cqe_seen;
 	struct async_cmd_info cmdinfo;
-	struct blk_mq_hw_ctx *hctx;
 };
 
 /*
@@ -182,9 +182,12 @@ static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
 	struct nvme_dev *dev = data;
 	struct nvme_queue *nvmeq = dev->queues[0];
 
-	WARN_ON(nvmeq->hctx);
-	nvmeq->hctx = hctx;
+	WARN_ON(hctx_idx != 0);
+	WARN_ON(dev->admin_tagset.tags[0] != hctx->tags);
+	WARN_ON(nvmeq->tags);
+
 	hctx->driver_data = nvmeq;
+	nvmeq->tags = &dev->admin_tagset.tags[0];
 	return 0;
 }
 
@@ -201,27 +204,16 @@ static int nvme_admin_init_request(void *data, struct request *req,
 	return 0;
 }
 
-static void nvme_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
-{
-	struct nvme_queue *nvmeq = hctx->driver_data;
-
-	nvmeq->hctx = NULL;
-}
-
 static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
 			  unsigned int hctx_idx)
 {
 	struct nvme_dev *dev = data;
-	struct nvme_queue *nvmeq = dev->queues[
-					(hctx_idx % dev->queue_count) + 1];
-
-	if (!nvmeq->hctx)
-		nvmeq->hctx = hctx;
+	struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1];
 
-	/* nvmeq queues are shared between namespaces. We assume here that
-	 * blk-mq map the tags so they match up with the nvme queue tags. */
-	WARN_ON(nvmeq->hctx->tags != hctx->tags);
+	if (!nvmeq->tags)
+		nvmeq->tags = &dev->tagset.tags[hctx_idx];
 
+	WARN_ON(dev->tagset.tags[hctx_idx] != hctx->tags);
 	hctx->driver_data = nvmeq;
 	return 0;
 }
@@ -320,7 +312,7 @@ static void abort_completion(struct nvme_queue *nvmeq, void *ctx,
 	u16 status = le16_to_cpup(&cqe->status) >> 1;
 	u32 result = le32_to_cpup(&cqe->result);
 
-	blk_mq_free_hctx_request(nvmeq->hctx, req);
+	blk_mq_free_request(req);
 
 	dev_warn(nvmeq->q_dmadev, "Abort status:%x result:%x", status, result);
 	++nvmeq->dev->abort_limit;
@@ -333,14 +325,13 @@ static void async_completion(struct nvme_queue *nvmeq, void *ctx,
 	cmdinfo->result = le32_to_cpup(&cqe->result);
 	cmdinfo->status = le16_to_cpup(&cqe->status) >> 1;
 	queue_kthread_work(cmdinfo->worker, &cmdinfo->work);
-	blk_mq_free_hctx_request(nvmeq->hctx, cmdinfo->req);
+	blk_mq_free_request(cmdinfo->req);
 }
 
 static inline struct nvme_cmd_info *get_cmd_from_tag(struct nvme_queue *nvmeq,
 				  unsigned int tag)
 {
-	struct blk_mq_hw_ctx *hctx = nvmeq->hctx;
-	struct request *req = blk_mq_tag_to_rq(hctx->tags, tag);
+	struct request *req = blk_mq_tag_to_rq(*nvmeq->tags, tag);
 
 	return blk_mq_rq_to_pdu(req);
 }
@@ -445,7 +436,7 @@ static struct nvme_iod *nvme_alloc_iod(struct request *rq, struct nvme_dev *dev,
 				(unsigned long) rq, gfp);
 }
 
-void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
+static void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
 {
 	const int last_prp = dev->page_size / 8 - 1;
 	int i;
@@ -605,22 +596,30 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx,
 			spin_unlock_irqrestore(req->q->queue_lock, flags);
 			return;
 		}
-		req->errors = nvme_error_status(status);
+		if (req->cmd_type == REQ_TYPE_DRV_PRIV) {
+			req->errors = status;
+		} else {
+			req->errors = nvme_error_status(status);
+		}
 	} else
 		req->errors = 0;
+	if (req->cmd_type == REQ_TYPE_DRV_PRIV) {
+		u32 result = le32_to_cpup(&cqe->result);
+		req->special = (void *)(uintptr_t)result;
+	}
 
 	if (cmd_rq->aborted)
-		dev_warn(&nvmeq->dev->pci_dev->dev,
+		dev_warn(nvmeq->dev->dev,
 			"completing aborted command with status:%04x\n",
 			status);
 
 	if (iod->nents) {
-		dma_unmap_sg(&nvmeq->dev->pci_dev->dev, iod->sg, iod->nents,
+		dma_unmap_sg(nvmeq->dev->dev, iod->sg, iod->nents,
 			rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
 		if (blk_integrity_rq(req)) {
 			if (!rq_data_dir(req))
 				nvme_dif_remap(req, nvme_dif_complete);
-			dma_unmap_sg(&nvmeq->dev->pci_dev->dev, iod->meta_sg, 1,
+			dma_unmap_sg(nvmeq->dev->dev, iod->meta_sg, 1,
 				rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
 		}
 	}
@@ -630,8 +629,8 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx,
 }
 
 /* length is in bytes.  gfp flags indicates whether we may sleep. */
-int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, int total_len,
-								gfp_t gfp)
+static int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod,
+		int total_len, gfp_t gfp)
 {
 	struct dma_pool *pool;
 	int length = total_len;
@@ -709,6 +708,23 @@ int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, int total_len,
 	return total_len;
 }
 
+static void nvme_submit_priv(struct nvme_queue *nvmeq, struct request *req,
+		struct nvme_iod *iod)
+{
+	struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];
+
+	memcpy(cmnd, req->cmd, sizeof(struct nvme_command));
+	cmnd->rw.command_id = req->tag;
+	if (req->nr_phys_segments) {
+		cmnd->rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
+		cmnd->rw.prp2 = cpu_to_le64(iod->first_dma);
+	}
+
+	if (++nvmeq->sq_tail == nvmeq->q_depth)
+		nvmeq->sq_tail = 0;
+	writel(nvmeq->sq_tail, nvmeq->q_db);
+}
+
 /*
  * We reuse the small pool to allocate the 16-byte range here as it is not
  * worth having a special pool for these or additional cases to handle freeing
@@ -807,11 +823,15 @@ static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod,
 	return 0;
 }
 
+/*
+ * NOTE: ns is NULL when called on the admin queue.
+ */
 static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 			 const struct blk_mq_queue_data *bd)
 {
 	struct nvme_ns *ns = hctx->queue->queuedata;
 	struct nvme_queue *nvmeq = hctx->driver_data;
+	struct nvme_dev *dev = nvmeq->dev;
 	struct request *req = bd->rq;
 	struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
 	struct nvme_iod *iod;
@@ -822,7 +842,7 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 	 * unless this namespace is formated such that the metadata can be
 	 * stripped/generated by the controller with PRACT=1.
 	 */
-	if (ns->ms && !blk_integrity_rq(req)) {
+	if (ns && ns->ms && !blk_integrity_rq(req)) {
 		if (!(ns->pi_type && ns->ms == 8)) {
 			req->errors = -EFAULT;
 			blk_mq_complete_request(req);
@@ -830,7 +850,7 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 		}
 	}
 
-	iod = nvme_alloc_iod(req, ns->dev, GFP_ATOMIC);
+	iod = nvme_alloc_iod(req, dev, GFP_ATOMIC);
 	if (!iod)
 		return BLK_MQ_RQ_QUEUE_BUSY;
 
@@ -841,8 +861,7 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 		 * as it is not worth having a special pool for these or
 		 * additional cases to handle freeing the iod.
 		 */
-		range = dma_pool_alloc(nvmeq->dev->prp_small_pool,
-						GFP_ATOMIC,
+		range = dma_pool_alloc(dev->prp_small_pool, GFP_ATOMIC,
 						&iod->first_dma);
 		if (!range)
 			goto retry_cmd;
@@ -860,9 +879,8 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 			goto retry_cmd;
 
 		if (blk_rq_bytes(req) !=
-                    nvme_setup_prps(nvmeq->dev, iod, blk_rq_bytes(req), GFP_ATOMIC)) {
-			dma_unmap_sg(&nvmeq->dev->pci_dev->dev, iod->sg,
-					iod->nents, dma_dir);
+                    nvme_setup_prps(dev, iod, blk_rq_bytes(req), GFP_ATOMIC)) {
+			dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir);
 			goto retry_cmd;
 		}
 		if (blk_integrity_rq(req)) {
@@ -884,7 +902,9 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 
 	nvme_set_info(cmd, iod, req_completion);
 	spin_lock_irq(&nvmeq->q_lock);
-	if (req->cmd_flags & REQ_DISCARD)
+	if (req->cmd_type == REQ_TYPE_DRV_PRIV)
+		nvme_submit_priv(nvmeq, req, iod);
+	else if (req->cmd_flags & REQ_DISCARD)
 		nvme_submit_discard(nvmeq, ns, req, iod);
 	else if (req->cmd_flags & REQ_FLUSH)
 		nvme_submit_flush(nvmeq, ns, req->tag);
@@ -896,10 +916,10 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 	return BLK_MQ_RQ_QUEUE_OK;
 
  error_cmd:
-	nvme_free_iod(nvmeq->dev, iod);
+	nvme_free_iod(dev, iod);
 	return BLK_MQ_RQ_QUEUE_ERROR;
  retry_cmd:
-	nvme_free_iod(nvmeq->dev, iod);
+	nvme_free_iod(dev, iod);
 	return BLK_MQ_RQ_QUEUE_BUSY;
 }
 
@@ -942,15 +962,6 @@ static int nvme_process_cq(struct nvme_queue *nvmeq)
 	return 1;
 }
 
-/* Admin queue isn't initialized as a request queue. If at some point this
- * happens anyway, make sure to notify the user */
-static int nvme_admin_queue_rq(struct blk_mq_hw_ctx *hctx,
-			       const struct blk_mq_queue_data *bd)
-{
-	WARN_ON_ONCE(1);
-	return BLK_MQ_RQ_QUEUE_ERROR;
-}
-
 static irqreturn_t nvme_irq(int irq, void *data)
 {
 	irqreturn_t result;
@@ -972,46 +983,61 @@ static irqreturn_t nvme_irq_check(int irq, void *data)
 	return IRQ_WAKE_THREAD;
 }
 
-struct sync_cmd_info {
-	struct task_struct *task;
-	u32 result;
-	int status;
-};
-
-static void sync_completion(struct nvme_queue *nvmeq, void *ctx,
-						struct nvme_completion *cqe)
-{
-	struct sync_cmd_info *cmdinfo = ctx;
-	cmdinfo->result = le32_to_cpup(&cqe->result);
-	cmdinfo->status = le16_to_cpup(&cqe->status) >> 1;
-	wake_up_process(cmdinfo->task);
-}
-
 /*
  * Returns 0 on success.  If the result is negative, it's a Linux error code;
  * if the result is positive, it's an NVM Express status code
  */
-static int nvme_submit_sync_cmd(struct request *req, struct nvme_command *cmd,
-						u32 *result, unsigned timeout)
+int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
+		void *buffer, void __user *ubuffer, unsigned bufflen,
+		u32 *result, unsigned timeout)
 {
-	struct sync_cmd_info cmdinfo;
-	struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
-	struct nvme_queue *nvmeq = cmd_rq->nvmeq;
+	bool write = cmd->common.opcode & 1;
+	struct bio *bio = NULL;
+	struct request *req;
+	int ret;
+
+	req = blk_mq_alloc_request(q, write, GFP_KERNEL, false);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
 
-	cmdinfo.task = current;
-	cmdinfo.status = -EINTR;
+	req->cmd_type = REQ_TYPE_DRV_PRIV;
+	req->cmd_flags = REQ_FAILFAST_DRIVER;
+	req->__data_len = 0;
+	req->__sector = (sector_t) -1;
+	req->bio = req->biotail = NULL;
 
-	cmd->common.command_id = req->tag;
+	req->timeout = timeout ? timeout : ADMIN_TIMEOUT;
 
-	nvme_set_info(cmd_rq, &cmdinfo, sync_completion);
+	req->cmd = (unsigned char *)cmd;
+	req->cmd_len = sizeof(struct nvme_command);
+	req->special = (void *)0;
 
-	set_current_state(TASK_UNINTERRUPTIBLE);
-	nvme_submit_cmd(nvmeq, cmd);
-	schedule();
+	if (buffer && bufflen) {
+		ret = blk_rq_map_kern(q, req, buffer, bufflen, __GFP_WAIT);
+		if (ret)
+			goto out;
+	} else if (ubuffer && bufflen) {
+		ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, __GFP_WAIT);
+		if (ret)
+			goto out;
+		bio = req->bio;
+	}
 
+	blk_execute_rq(req->q, NULL, req, 0);
+	if (bio)
+		blk_rq_unmap_user(bio);
 	if (result)
-		*result = cmdinfo.result;
-	return cmdinfo.status;
+		*result = (u32)(uintptr_t)req->special;
+	ret = req->errors;
+ out:
+	blk_mq_free_request(req);
+	return ret;
+}
+
+int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
+		void *buffer, unsigned bufflen)
+{
+	return __nvme_submit_sync_cmd(q, cmd, buffer, NULL, bufflen, NULL, 0);
 }
 
 static int nvme_submit_async_admin_req(struct nvme_dev *dev)
@@ -1033,7 +1059,7 @@ static int nvme_submit_async_admin_req(struct nvme_dev *dev)
 	c.common.opcode = nvme_admin_async_event;
 	c.common.command_id = req->tag;
 
-	blk_mq_free_hctx_request(nvmeq->hctx, req);
+	blk_mq_free_request(req);
 	return __nvme_submit_cmd(nvmeq, &c);
 }
 
@@ -1060,41 +1086,6 @@ static int nvme_submit_admin_async_cmd(struct nvme_dev *dev,
 	return nvme_submit_cmd(nvmeq, cmd);
 }
 
-static int __nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd,
-						u32 *result, unsigned timeout)
-{
-	int res;
-	struct request *req;
-
-	req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_KERNEL, false);
-	if (IS_ERR(req))
-		return PTR_ERR(req);
-	res = nvme_submit_sync_cmd(req, cmd, result, timeout);
-	blk_mq_free_request(req);
-	return res;
-}
-
-int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd,
-								u32 *result)
-{
-	return __nvme_submit_admin_cmd(dev, cmd, result, ADMIN_TIMEOUT);
-}
-
-int nvme_submit_io_cmd(struct nvme_dev *dev, struct nvme_ns *ns,
-					struct nvme_command *cmd, u32 *result)
-{
-	int res;
-	struct request *req;
-
-	req = blk_mq_alloc_request(ns->queue, WRITE, (GFP_KERNEL|__GFP_WAIT),
-									false);
-	if (IS_ERR(req))
-		return PTR_ERR(req);
-	res = nvme_submit_sync_cmd(req, cmd, result, NVME_IO_TIMEOUT);
-	blk_mq_free_request(req);
-	return res;
-}
-
 static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
 {
 	struct nvme_command c;
@@ -1103,7 +1094,7 @@ static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
 	c.delete_queue.opcode = opcode;
 	c.delete_queue.qid = cpu_to_le16(id);
 
-	return nvme_submit_admin_cmd(dev, &c, NULL);
+	return nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0);
 }
 
 static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
@@ -1112,6 +1103,10 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
 	struct nvme_command c;
 	int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED;
 
+	/*
+	 * Note: we (ab)use the fact the the prp fields survive if no data
+	 * is attached to the request.
+	 */
 	memset(&c, 0, sizeof(c));
 	c.create_cq.opcode = nvme_admin_create_cq;
 	c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr);
@@ -1120,7 +1115,7 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
 	c.create_cq.cq_flags = cpu_to_le16(flags);
 	c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector);
 
-	return nvme_submit_admin_cmd(dev, &c, NULL);
+	return nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0);
 }
 
 static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
@@ -1129,6 +1124,10 @@ static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
 	struct nvme_command c;
 	int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM;
 
+	/*
+	 * Note: we (ab)use the fact the the prp fields survive if no data
+	 * is attached to the request.
+	 */
 	memset(&c, 0, sizeof(c));
 	c.create_sq.opcode = nvme_admin_create_sq;
 	c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr);
@@ -1137,7 +1136,7 @@ static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
 	c.create_sq.sq_flags = cpu_to_le16(flags);
 	c.create_sq.cqid = cpu_to_le16(qid);
 
-	return nvme_submit_admin_cmd(dev, &c, NULL);
+	return nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0);
 }
 
 static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid)
@@ -1150,18 +1149,43 @@ static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid)
 	return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid);
 }
 
-int nvme_identify(struct nvme_dev *dev, unsigned nsid, unsigned cns,
-							dma_addr_t dma_addr)
+int nvme_identify_ctrl(struct nvme_dev *dev, struct nvme_id_ctrl **id)
 {
-	struct nvme_command c;
+	struct nvme_command c = {
+		.identify.opcode = nvme_admin_identify,
+		.identify.cns = cpu_to_le32(1),
+	};
+	int error;
 
-	memset(&c, 0, sizeof(c));
-	c.identify.opcode = nvme_admin_identify;
-	c.identify.nsid = cpu_to_le32(nsid);
-	c.identify.prp1 = cpu_to_le64(dma_addr);
-	c.identify.cns = cpu_to_le32(cns);
+	*id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL);
+	if (!*id)
+		return -ENOMEM;
 
-	return nvme_submit_admin_cmd(dev, &c, NULL);
+	error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
+			sizeof(struct nvme_id_ctrl));
+	if (error)
+		kfree(*id);
+	return error;
+}
+
+int nvme_identify_ns(struct nvme_dev *dev, unsigned nsid,
+		struct nvme_id_ns **id)
+{
+	struct nvme_command c = {
+		.identify.opcode = nvme_admin_identify,
+		.identify.nsid = cpu_to_le32(nsid),
+	};
+	int error;
+
+	*id = kmalloc(sizeof(struct nvme_id_ns), GFP_KERNEL);
+	if (!*id)
+		return -ENOMEM;
+
+	error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
+			sizeof(struct nvme_id_ns));
+	if (error)
+		kfree(*id);
+	return error;
 }
 
 int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid,
@@ -1175,7 +1199,8 @@ int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid,
 	c.features.prp1 = cpu_to_le64(dma_addr);
 	c.features.fid = cpu_to_le32(fid);
 
-	return nvme_submit_admin_cmd(dev, &c, result);
+	return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, NULL, 0,
+			result, 0);
 }
 
 int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11,
@@ -1189,7 +1214,30 @@ int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11,
 	c.features.fid = cpu_to_le32(fid);
 	c.features.dword11 = cpu_to_le32(dword11);
 
-	return nvme_submit_admin_cmd(dev, &c, result);
+	return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, NULL, 0,
+			result, 0);
+}
+
+int nvme_get_log_page(struct nvme_dev *dev, struct nvme_smart_log **log)
+{
+	struct nvme_command c = {
+		.common.opcode = nvme_admin_get_log_page,
+		.common.nsid = cpu_to_le32(0xFFFFFFFF),
+		.common.cdw10[0] = cpu_to_le32(
+			(((sizeof(struct nvme_smart_log) / 4) - 1) << 16) |
+			 NVME_LOG_SMART),
+	};
+	int error;
+
+	*log = kmalloc(sizeof(struct nvme_smart_log), GFP_KERNEL);
+	if (!*log)
+		return -ENOMEM;
+
+	error = nvme_submit_sync_cmd(dev->admin_q, &c, *log,
+			sizeof(struct nvme_smart_log));
+	if (error)
+		kfree(*log);
+	return error;
 }
 
 /**
@@ -1214,8 +1262,7 @@ static void nvme_abort_req(struct request *req)
 		if (work_busy(&dev->reset_work))
 			goto out;
 		list_del_init(&dev->node);
-		dev_warn(&dev->pci_dev->dev,
-			"I/O %d QID %d timeout, reset controller\n",
+		dev_warn(dev->dev, "I/O %d QID %d timeout, reset controller\n",
 							req->tag, nvmeq->qid);
 		dev->reset_workfn = nvme_reset_failed_dev;
 		queue_work(nvme_workq, &dev->reset_work);
@@ -1254,8 +1301,7 @@ static void nvme_abort_req(struct request *req)
 	}
 }
 
-static void nvme_cancel_queue_ios(struct blk_mq_hw_ctx *hctx,
-				struct request *req, void *data, bool reserved)
+static void nvme_cancel_queue_ios(struct request *req, void *data, bool reserved)
 {
 	struct nvme_queue *nvmeq = data;
 	void *ctx;
@@ -1352,11 +1398,9 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq)
 
 static void nvme_clear_queue(struct nvme_queue *nvmeq)
 {
-	struct blk_mq_hw_ctx *hctx = nvmeq->hctx;
-
 	spin_lock_irq(&nvmeq->q_lock);
-	if (hctx && hctx->tags)
-		blk_mq_tag_busy_iter(hctx, nvme_cancel_queue_ios, nvmeq);
+	if (nvmeq->tags && *nvmeq->tags)
+		blk_mq_all_tag_busy_iter(*nvmeq->tags, nvme_cancel_queue_ios, nvmeq);
 	spin_unlock_irq(&nvmeq->q_lock);
 }
 
@@ -1384,22 +1428,21 @@ static void nvme_disable_queue(struct nvme_dev *dev, int qid)
 static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
 							int depth)
 {
-	struct device *dmadev = &dev->pci_dev->dev;
 	struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq), GFP_KERNEL);
 	if (!nvmeq)
 		return NULL;
 
-	nvmeq->cqes = dma_zalloc_coherent(dmadev, CQ_SIZE(depth),
+	nvmeq->cqes = dma_zalloc_coherent(dev->dev, CQ_SIZE(depth),
 					  &nvmeq->cq_dma_addr, GFP_KERNEL);
 	if (!nvmeq->cqes)
 		goto free_nvmeq;
 
-	nvmeq->sq_cmds = dma_alloc_coherent(dmadev, SQ_SIZE(depth),
+	nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth),
 					&nvmeq->sq_dma_addr, GFP_KERNEL);
 	if (!nvmeq->sq_cmds)
 		goto free_cqdma;
 
-	nvmeq->q_dmadev = dmadev;
+	nvmeq->q_dmadev = dev->dev;
 	nvmeq->dev = dev;
 	snprintf(nvmeq->irqname, sizeof(nvmeq->irqname), "nvme%dq%d",
 			dev->instance, qid);
@@ -1415,7 +1458,7 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
 	return nvmeq;
 
  free_cqdma:
-	dma_free_coherent(dmadev, CQ_SIZE(depth), (void *)nvmeq->cqes,
+	dma_free_coherent(dev->dev, CQ_SIZE(depth), (void *)nvmeq->cqes,
 							nvmeq->cq_dma_addr);
  free_nvmeq:
 	kfree(nvmeq);
@@ -1487,7 +1530,7 @@ static int nvme_wait_ready(struct nvme_dev *dev, u64 cap, bool enabled)
 		if (fatal_signal_pending(current))
 			return -EINTR;
 		if (time_after(jiffies, timeout)) {
-			dev_err(&dev->pci_dev->dev,
+			dev_err(dev->dev,
 				"Device not ready; aborting %s\n", enabled ?
 						"initialisation" : "reset");
 			return -ENODEV;
@@ -1537,7 +1580,7 @@ static int nvme_shutdown_ctrl(struct nvme_dev *dev)
 		if (fatal_signal_pending(current))
 			return -EINTR;
 		if (time_after(jiffies, timeout)) {
-			dev_err(&dev->pci_dev->dev,
+			dev_err(dev->dev,
 				"Device shutdown incomplete; abort shutdown\n");
 			return -ENODEV;
 		}
@@ -1547,10 +1590,9 @@ static int nvme_shutdown_ctrl(struct nvme_dev *dev)
 }
 
 static struct blk_mq_ops nvme_mq_admin_ops = {
-	.queue_rq	= nvme_admin_queue_rq,
+	.queue_rq	= nvme_queue_rq,
 	.map_queue	= blk_mq_map_queue,
 	.init_hctx	= nvme_admin_init_hctx,
-	.exit_hctx	= nvme_exit_hctx,
 	.init_request	= nvme_admin_init_request,
 	.timeout	= nvme_timeout,
 };
@@ -1559,7 +1601,6 @@ static struct blk_mq_ops nvme_mq_ops = {
 	.queue_rq	= nvme_queue_rq,
 	.map_queue	= blk_mq_map_queue,
 	.init_hctx	= nvme_init_hctx,
-	.exit_hctx	= nvme_exit_hctx,
 	.init_request	= nvme_init_request,
 	.timeout	= nvme_timeout,
 };
@@ -1580,7 +1621,7 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev)
 		dev->admin_tagset.queue_depth = NVME_AQ_DEPTH - 1;
 		dev->admin_tagset.reserved_tags = 1;
 		dev->admin_tagset.timeout = ADMIN_TIMEOUT;
-		dev->admin_tagset.numa_node = dev_to_node(&dev->pci_dev->dev);
+		dev->admin_tagset.numa_node = dev_to_node(dev->dev);
 		dev->admin_tagset.cmd_size = nvme_cmd_size(dev);
 		dev->admin_tagset.driver_data = dev;
 
@@ -1613,14 +1654,14 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev)
 	unsigned dev_page_max = NVME_CAP_MPSMAX(cap) + 12;
 
 	if (page_shift < dev_page_min) {
-		dev_err(&dev->pci_dev->dev,
+		dev_err(dev->dev,
 				"Minimum device page size (%u) too large for "
 				"host (%u)\n", 1 << dev_page_min,
 				1 << page_shift);
 		return -ENODEV;
 	}
 	if (page_shift > dev_page_max) {
-		dev_info(&dev->pci_dev->dev,
+		dev_info(dev->dev,
 				"Device maximum page size (%u) smaller than "
 				"host (%u); enabling work-around\n",
 				1 << dev_page_max, 1 << page_shift);
@@ -1668,126 +1709,44 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev)
 	return result;
 }
 
-struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write,
-				unsigned long addr, unsigned length)
-{
-	int i, err, count, nents, offset;
-	struct scatterlist *sg;
-	struct page **pages;
-	struct nvme_iod *iod;
-
-	if (addr & 3)
-		return ERR_PTR(-EINVAL);
-	if (!length || length > INT_MAX - PAGE_SIZE)
-		return ERR_PTR(-EINVAL);
-
-	offset = offset_in_page(addr);
-	count = DIV_ROUND_UP(offset + length, PAGE_SIZE);
-	pages = kcalloc(count, sizeof(*pages), GFP_KERNEL);
-	if (!pages)
-		return ERR_PTR(-ENOMEM);
-
-	err = get_user_pages_fast(addr, count, 1, pages);
-	if (err < count) {
-		count = err;
-		err = -EFAULT;
-		goto put_pages;
-	}
-
-	err = -ENOMEM;
-	iod = __nvme_alloc_iod(count, length, dev, 0, GFP_KERNEL);
-	if (!iod)
-		goto put_pages;
-
-	sg = iod->sg;
-	sg_init_table(sg, count);
-	for (i = 0; i < count; i++) {
-		sg_set_page(&sg[i], pages[i],
-			    min_t(unsigned, length, PAGE_SIZE - offset),
-			    offset);
-		length -= (PAGE_SIZE - offset);
-		offset = 0;
-	}
-	sg_mark_end(&sg[i - 1]);
-	iod->nents = count;
-
-	nents = dma_map_sg(&dev->pci_dev->dev, sg, count,
-				write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
-	if (!nents)
-		goto free_iod;
-
-	kfree(pages);
-	return iod;
-
- free_iod:
-	kfree(iod);
- put_pages:
-	for (i = 0; i < count; i++)
-		put_page(pages[i]);
-	kfree(pages);
-	return ERR_PTR(err);
-}
-
-void nvme_unmap_user_pages(struct nvme_dev *dev, int write,
-			struct nvme_iod *iod)
-{
-	int i;
-
-	dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents,
-				write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
-
-	for (i = 0; i < iod->nents; i++)
-		put_page(sg_page(&iod->sg[i]));
-}
-
 static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
 {
 	struct nvme_dev *dev = ns->dev;
 	struct nvme_user_io io;
 	struct nvme_command c;
-	unsigned length, meta_len, prp_len;
+	unsigned length, meta_len;
 	int status, write;
-	struct nvme_iod *iod;
 	dma_addr_t meta_dma = 0;
 	void *meta = NULL;
 	void __user *metadata;
 
 	if (copy_from_user(&io, uio, sizeof(io)))
 		return -EFAULT;
-	length = (io.nblocks + 1) << ns->lba_shift;
-	meta_len = (io.nblocks + 1) * ns->ms;
-
-	if (meta_len && ((io.metadata & 3) || !io.metadata) && !ns->ext)
-		return -EINVAL;
-	else if (meta_len && ns->ext) {
-		length += meta_len;
-		meta_len = 0;
-	}
-
-	metadata = (void __user *)(unsigned long)io.metadata;
-
-	write = io.opcode & 1;
 
 	switch (io.opcode) {
 	case nvme_cmd_write:
 	case nvme_cmd_read:
 	case nvme_cmd_compare:
-		iod = nvme_map_user_pages(dev, write, io.addr, length);
 		break;
 	default:
 		return -EINVAL;
 	}
 
-	if (IS_ERR(iod))
-		return PTR_ERR(iod);
+	length = (io.nblocks + 1) << ns->lba_shift;
+	meta_len = (io.nblocks + 1) * ns->ms;
+	write = io.opcode & 1;
+	metadata = (void __user *)(unsigned long)io.metadata;
 
-	prp_len = nvme_setup_prps(dev, iod, length, GFP_KERNEL);
-	if (length != prp_len) {
-		status = -ENOMEM;
-		goto unmap;
-	}
 	if (meta_len) {
-		meta = dma_alloc_coherent(&dev->pci_dev->dev, meta_len,
+		if (((io.metadata & 3) || !io.metadata) && !ns->ext)
+			return -EINVAL;
+
+		if (ns->ext) {
+			length += meta_len;
+			meta_len = 0;
+		}
+
+		meta = dma_alloc_coherent(dev->dev, meta_len,
 						&meta_dma, GFP_KERNEL);
 
 		if (!meta) {
@@ -1813,19 +1772,17 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
 	c.rw.reftag = cpu_to_le32(io.reftag);
 	c.rw.apptag = cpu_to_le16(io.apptag);
 	c.rw.appmask = cpu_to_le16(io.appmask);
-	c.rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
-	c.rw.prp2 = cpu_to_le64(iod->first_dma);
 	c.rw.metadata = cpu_to_le64(meta_dma);
-	status = nvme_submit_io_cmd(dev, ns, &c, NULL);
+
+	status = __nvme_submit_sync_cmd(ns->queue, &c, NULL,
+			(void __user *)io.addr, length, NULL, 0);
  unmap:
-	nvme_unmap_user_pages(dev, write, iod);
-	nvme_free_iod(dev, iod);
 	if (meta) {
 		if (status == NVME_SC_SUCCESS && !write) {
 			if (copy_to_user(metadata, meta, meta_len))
 				status = -EFAULT;
 		}
-		dma_free_coherent(&dev->pci_dev->dev, meta_len, meta, meta_dma);
+		dma_free_coherent(dev->dev, meta_len, meta, meta_dma);
 	}
 	return status;
 }
@@ -1835,9 +1792,8 @@ static int nvme_user_cmd(struct nvme_dev *dev, struct nvme_ns *ns,
 {
 	struct nvme_passthru_cmd cmd;
 	struct nvme_command c;
-	int status, length;
-	struct nvme_iod *uninitialized_var(iod);
-	unsigned timeout;
+	unsigned timeout = 0;
+	int status;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EACCES;
@@ -1857,46 +1813,17 @@ static int nvme_user_cmd(struct nvme_dev *dev, struct nvme_ns *ns,
 	c.common.cdw10[4] = cpu_to_le32(cmd.cdw14);
 	c.common.cdw10[5] = cpu_to_le32(cmd.cdw15);
 
-	length = cmd.data_len;
-	if (cmd.data_len) {
-		iod = nvme_map_user_pages(dev, cmd.opcode & 1, cmd.addr,
-								length);
-		if (IS_ERR(iod))
-			return PTR_ERR(iod);
-		length = nvme_setup_prps(dev, iod, length, GFP_KERNEL);
-		c.common.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
-		c.common.prp2 = cpu_to_le64(iod->first_dma);
-	}
-
-	timeout = cmd.timeout_ms ? msecs_to_jiffies(cmd.timeout_ms) :
-								ADMIN_TIMEOUT;
-
-	if (length != cmd.data_len)
-		status = -ENOMEM;
-	else if (ns) {
-		struct request *req;
-
-		req = blk_mq_alloc_request(ns->queue, WRITE,
-						(GFP_KERNEL|__GFP_WAIT), false);
-		if (IS_ERR(req))
-			status = PTR_ERR(req);
-		else {
-			status = nvme_submit_sync_cmd(req, &c, &cmd.result,
-								timeout);
-			blk_mq_free_request(req);
-		}
-	} else
-		status = __nvme_submit_admin_cmd(dev, &c, &cmd.result, timeout);
+	if (cmd.timeout_ms)
+		timeout = msecs_to_jiffies(cmd.timeout_ms);
 
-	if (cmd.data_len) {
-		nvme_unmap_user_pages(dev, cmd.opcode & 1, iod);
-		nvme_free_iod(dev, iod);
+	status = __nvme_submit_sync_cmd(ns ? ns->queue : dev->admin_q, &c,
+			NULL, (void __user *)cmd.addr, cmd.data_len,
+			&cmd.result, timeout);
+	if (status >= 0) {
+		if (put_user(cmd.result, &ucmd->result))
+			return -EFAULT;
 	}
 
-	if ((status >= 0) && copy_to_user(&ucmd->result, &cmd.result,
-							sizeof(cmd.result)))
-		status = -EFAULT;
-
 	return status;
 }
 
@@ -1988,24 +1915,14 @@ static int nvme_revalidate_disk(struct gendisk *disk)
 	struct nvme_ns *ns = disk->private_data;
 	struct nvme_dev *dev = ns->dev;
 	struct nvme_id_ns *id;
-	dma_addr_t dma_addr;
 	u8 lbaf, pi_type;
 	u16 old_ms;
 	unsigned short bs;
 
-	id = dma_alloc_coherent(&dev->pci_dev->dev, 4096, &dma_addr,
-								GFP_KERNEL);
-	if (!id) {
-		dev_warn(&dev->pci_dev->dev, "%s: Memory alocation failure\n",
-								__func__);
+	if (nvme_identify_ns(dev, ns->ns_id, &id)) {
+		dev_warn(dev->dev, "%s: Identify failure\n", __func__);
 		return 0;
 	}
-	if (nvme_identify(dev, ns->ns_id, 0, dma_addr)) {
-		dev_warn(&dev->pci_dev->dev,
-			"identify failed ns:%d, setting capacity to 0\n",
-			ns->ns_id);
-		memset(id, 0, sizeof(*id));
-	}
 
 	old_ms = ns->ms;
 	lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
@@ -2046,7 +1963,7 @@ static int nvme_revalidate_disk(struct gendisk *disk)
 	if (dev->oncs & NVME_CTRL_ONCS_DSM)
 		nvme_config_discard(ns);
 
-	dma_free_coherent(&dev->pci_dev->dev, 4096, id, dma_addr);
+	kfree(id);
 	return 0;
 }
 
@@ -2073,7 +1990,7 @@ static int nvme_kthread(void *data)
 				if (work_busy(&dev->reset_work))
 					continue;
 				list_del_init(&dev->node);
-				dev_warn(&dev->pci_dev->dev,
+				dev_warn(dev->dev,
 					"Failed status: %x, reset controller\n",
 					readl(&dev->bar->csts));
 				dev->reset_workfn = nvme_reset_failed_dev;
@@ -2105,7 +2022,7 @@ static void nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid)
 {
 	struct nvme_ns *ns;
 	struct gendisk *disk;
-	int node = dev_to_node(&dev->pci_dev->dev);
+	int node = dev_to_node(dev->dev);
 
 	ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
 	if (!ns)
@@ -2188,8 +2105,7 @@ static int set_queue_count(struct nvme_dev *dev, int count)
 	if (status < 0)
 		return status;
 	if (status > 0) {
-		dev_err(&dev->pci_dev->dev, "Could not set queue count (%d)\n",
-									status);
+		dev_err(dev->dev, "Could not set queue count (%d)\n", status);
 		return 0;
 	}
 	return min(result & 0xffff, result >> 16) + 1;
@@ -2203,7 +2119,7 @@ static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues)
 static int nvme_setup_io_queues(struct nvme_dev *dev)
 {
 	struct nvme_queue *adminq = dev->queues[0];
-	struct pci_dev *pdev = dev->pci_dev;
+	struct pci_dev *pdev = to_pci_dev(dev->dev);
 	int result, i, vecs, nr_io_queues, size;
 
 	nr_io_queues = num_possible_cpus();
@@ -2283,26 +2199,18 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
  */
 static int nvme_dev_add(struct nvme_dev *dev)
 {
-	struct pci_dev *pdev = dev->pci_dev;
+	struct pci_dev *pdev = to_pci_dev(dev->dev);
 	int res;
 	unsigned nn, i;
 	struct nvme_id_ctrl *ctrl;
-	void *mem;
-	dma_addr_t dma_addr;
 	int shift = NVME_CAP_MPSMIN(readq(&dev->bar->cap)) + 12;
 
-	mem = dma_alloc_coherent(&pdev->dev, 4096, &dma_addr, GFP_KERNEL);
-	if (!mem)
-		return -ENOMEM;
-
-	res = nvme_identify(dev, 0, 1, dma_addr);
+	res = nvme_identify_ctrl(dev, &ctrl);
 	if (res) {
-		dev_err(&pdev->dev, "Identify Controller failed (%d)\n", res);
-		dma_free_coherent(&dev->pci_dev->dev, 4096, mem, dma_addr);
+		dev_err(dev->dev, "Identify Controller failed (%d)\n", res);
 		return -EIO;
 	}
 
-	ctrl = mem;
 	nn = le32_to_cpup(&ctrl->nn);
 	dev->oncs = le16_to_cpup(&ctrl->oncs);
 	dev->abort_limit = ctrl->acl + 1;
@@ -2324,12 +2232,12 @@ static int nvme_dev_add(struct nvme_dev *dev)
 		} else
 			dev->max_hw_sectors = max_hw_sectors;
 	}
-	dma_free_coherent(&dev->pci_dev->dev, 4096, mem, dma_addr);
+	kfree(ctrl);
 
 	dev->tagset.ops = &nvme_mq_ops;
 	dev->tagset.nr_hw_queues = dev->online_queues - 1;
 	dev->tagset.timeout = NVME_IO_TIMEOUT;
-	dev->tagset.numa_node = dev_to_node(&dev->pci_dev->dev);
+	dev->tagset.numa_node = dev_to_node(dev->dev);
 	dev->tagset.queue_depth =
 				min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1;
 	dev->tagset.cmd_size = nvme_cmd_size(dev);
@@ -2349,7 +2257,7 @@ static int nvme_dev_map(struct nvme_dev *dev)
 {
 	u64 cap;
 	int bars, result = -ENOMEM;
-	struct pci_dev *pdev = dev->pci_dev;
+	struct pci_dev *pdev = to_pci_dev(dev->dev);
 
 	if (pci_enable_device_mem(pdev))
 		return result;
@@ -2363,8 +2271,8 @@ static int nvme_dev_map(struct nvme_dev *dev)
 	if (pci_request_selected_regions(pdev, bars, "nvme"))
 		goto disable_pci;
 
-	if (dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)) &&
-	    dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32)))
+	if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64)) &&
+	    dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(32)))
 		goto disable;
 
 	dev->bar = ioremap(pci_resource_start(pdev, 0), 8192);
@@ -2405,19 +2313,21 @@ static int nvme_dev_map(struct nvme_dev *dev)
 
 static void nvme_dev_unmap(struct nvme_dev *dev)
 {
-	if (dev->pci_dev->msi_enabled)
-		pci_disable_msi(dev->pci_dev);
-	else if (dev->pci_dev->msix_enabled)
-		pci_disable_msix(dev->pci_dev);
+	struct pci_dev *pdev = to_pci_dev(dev->dev);
+
+	if (pdev->msi_enabled)
+		pci_disable_msi(pdev);
+	else if (pdev->msix_enabled)
+		pci_disable_msix(pdev);
 
 	if (dev->bar) {
 		iounmap(dev->bar);
 		dev->bar = NULL;
-		pci_release_regions(dev->pci_dev);
+		pci_release_regions(pdev);
 	}
 
-	if (pci_is_enabled(dev->pci_dev))
-		pci_disable_device(dev->pci_dev);
+	if (pci_is_enabled(pdev))
+		pci_disable_device(pdev);
 }
 
 struct nvme_delq_ctx {
@@ -2536,7 +2446,7 @@ static void nvme_disable_io_queues(struct nvme_dev *dev)
 					&worker, "nvme%d", dev->instance);
 
 	if (IS_ERR(kworker_task)) {
-		dev_err(&dev->pci_dev->dev,
+		dev_err(dev->dev,
 			"Failed to create queue del task\n");
 		for (i = dev->queue_count - 1; i > 0; i--)
 			nvme_disable_queue(dev, i);
@@ -2587,9 +2497,9 @@ static void nvme_freeze_queues(struct nvme_dev *dev)
 	list_for_each_entry(ns, &dev->namespaces, list) {
 		blk_mq_freeze_queue_start(ns->queue);
 
-		spin_lock(ns->queue->queue_lock);
+		spin_lock_irq(ns->queue->queue_lock);
 		queue_flag_set(QUEUE_FLAG_STOPPED, ns->queue);
-		spin_unlock(ns->queue->queue_lock);
+		spin_unlock_irq(ns->queue->queue_lock);
 
 		blk_mq_cancel_requeue_work(ns->queue);
 		blk_mq_stop_hw_queues(ns->queue);
@@ -2654,14 +2564,13 @@ static void nvme_dev_remove(struct nvme_dev *dev)
 
 static int nvme_setup_prp_pools(struct nvme_dev *dev)
 {
-	struct device *dmadev = &dev->pci_dev->dev;
-	dev->prp_page_pool = dma_pool_create("prp list page", dmadev,
+	dev->prp_page_pool = dma_pool_create("prp list page", dev->dev,
 						PAGE_SIZE, PAGE_SIZE, 0);
 	if (!dev->prp_page_pool)
 		return -ENOMEM;
 
 	/* Optimisation for I/Os between 4k and 128k */
-	dev->prp_small_pool = dma_pool_create("prp list 256", dmadev,
+	dev->prp_small_pool = dma_pool_create("prp list 256", dev->dev,
 						256, 256, 0);
 	if (!dev->prp_small_pool) {
 		dma_pool_destroy(dev->prp_page_pool);
@@ -2725,7 +2634,7 @@ static void nvme_free_dev(struct kref *kref)
 {
 	struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref);
 
-	pci_dev_put(dev->pci_dev);
+	put_device(dev->dev);
 	put_device(dev->device);
 	nvme_free_namespaces(dev);
 	nvme_release_instance(dev);
@@ -2802,11 +2711,11 @@ static void nvme_set_irq_hints(struct nvme_dev *dev)
 	for (i = 0; i < dev->online_queues; i++) {
 		nvmeq = dev->queues[i];
 
-		if (!nvmeq->hctx)
+		if (!nvmeq->tags || !(*nvmeq->tags))
 			continue;
 
 		irq_set_affinity_hint(dev->entry[nvmeq->cq_vector].vector,
-							nvmeq->hctx->cpumask);
+					blk_mq_tags_cpumask(*nvmeq->tags));
 	}
 }
 
@@ -2869,7 +2778,7 @@ static int nvme_dev_start(struct nvme_dev *dev)
 static int nvme_remove_dead_ctrl(void *arg)
 {
 	struct nvme_dev *dev = (struct nvme_dev *)arg;
-	struct pci_dev *pdev = dev->pci_dev;
+	struct pci_dev *pdev = to_pci_dev(dev->dev);
 
 	if (pci_get_drvdata(pdev))
 		pci_stop_and_remove_bus_device_locked(pdev);
@@ -2908,11 +2817,11 @@ static void nvme_dev_reset(struct nvme_dev *dev)
 {
 	nvme_dev_shutdown(dev);
 	if (nvme_dev_resume(dev)) {
-		dev_warn(&dev->pci_dev->dev, "Device failed to resume\n");
+		dev_warn(dev->dev, "Device failed to resume\n");
 		kref_get(&dev->kref);
 		if (IS_ERR(kthread_run(nvme_remove_dead_ctrl, dev, "nvme%d",
 							dev->instance))) {
-			dev_err(&dev->pci_dev->dev,
+			dev_err(dev->dev,
 				"Failed to start controller remove task\n");
 			kref_put(&dev->kref, nvme_free_dev);
 		}
@@ -2956,7 +2865,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	INIT_LIST_HEAD(&dev->namespaces);
 	dev->reset_workfn = nvme_reset_failed_dev;
 	INIT_WORK(&dev->reset_work, nvme_reset_workfn);
-	dev->pci_dev = pci_dev_get(pdev);
+	dev->dev = get_device(&pdev->dev);
 	pci_set_drvdata(pdev, dev);
 	result = nvme_set_instance(dev);
 	if (result)
@@ -2986,7 +2895,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
  release:
 	nvme_release_instance(dev);
  put_pci:
-	pci_dev_put(dev->pci_dev);
+	put_device(dev->dev);
  free:
 	kfree(dev->queues);
 	kfree(dev->entry);
diff --git a/drivers/block/nvme-scsi.c b/drivers/block/nvme-scsi.c
index 44f2514fb7755d..a0fade9c6bd275 100644
--- a/drivers/block/nvme-scsi.c
+++ b/drivers/block/nvme-scsi.c
@@ -41,15 +41,13 @@
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/types.h>
+#include <asm/unaligned.h>
 #include <scsi/sg.h>
 #include <scsi/scsi.h>
 
 
 static int sg_version_num = 30534;	/* 2 digits for each component */
 
-#define SNTI_TRANSLATION_SUCCESS			0
-#define SNTI_INTERNAL_ERROR				1
-
 /* VPD Page Codes */
 #define VPD_SUPPORTED_PAGES				0x00
 #define VPD_SERIAL_NUMBER				0x80
@@ -58,49 +56,14 @@ static int sg_version_num = 30534;	/* 2 digits for each component */
 #define VPD_BLOCK_LIMITS				0xB0
 #define VPD_BLOCK_DEV_CHARACTERISTICS			0xB1
 
-/* CDB offsets */
-#define REPORT_LUNS_CDB_ALLOC_LENGTH_OFFSET		6
-#define REPORT_LUNS_SR_OFFSET				2
-#define READ_CAP_16_CDB_ALLOC_LENGTH_OFFSET		10
-#define REQUEST_SENSE_CDB_ALLOC_LENGTH_OFFSET		4
-#define REQUEST_SENSE_DESC_OFFSET			1
-#define REQUEST_SENSE_DESC_MASK				0x01
-#define DESCRIPTOR_FORMAT_SENSE_DATA_TYPE		1
-#define INQUIRY_EVPD_BYTE_OFFSET			1
-#define INQUIRY_PAGE_CODE_BYTE_OFFSET			2
-#define INQUIRY_EVPD_BIT_MASK				1
-#define INQUIRY_CDB_ALLOCATION_LENGTH_OFFSET		3
-#define START_STOP_UNIT_CDB_IMMED_OFFSET		1
-#define START_STOP_UNIT_CDB_IMMED_MASK			0x1
-#define START_STOP_UNIT_CDB_POWER_COND_MOD_OFFSET	3
-#define START_STOP_UNIT_CDB_POWER_COND_MOD_MASK		0xF
-#define START_STOP_UNIT_CDB_POWER_COND_OFFSET		4
-#define START_STOP_UNIT_CDB_POWER_COND_MASK		0xF0
-#define START_STOP_UNIT_CDB_NO_FLUSH_OFFSET		4
-#define START_STOP_UNIT_CDB_NO_FLUSH_MASK		0x4
-#define START_STOP_UNIT_CDB_START_OFFSET		4
-#define START_STOP_UNIT_CDB_START_MASK			0x1
-#define WRITE_BUFFER_CDB_MODE_OFFSET			1
-#define WRITE_BUFFER_CDB_MODE_MASK			0x1F
-#define WRITE_BUFFER_CDB_BUFFER_ID_OFFSET		2
-#define WRITE_BUFFER_CDB_BUFFER_OFFSET_OFFSET		3
-#define WRITE_BUFFER_CDB_PARM_LIST_LENGTH_OFFSET	6
-#define FORMAT_UNIT_CDB_FORMAT_PROT_INFO_OFFSET		1
-#define FORMAT_UNIT_CDB_FORMAT_PROT_INFO_MASK		0xC0
-#define FORMAT_UNIT_CDB_FORMAT_PROT_INFO_SHIFT		6
-#define FORMAT_UNIT_CDB_LONG_LIST_OFFSET		1
-#define FORMAT_UNIT_CDB_LONG_LIST_MASK			0x20
-#define FORMAT_UNIT_CDB_FORMAT_DATA_OFFSET		1
-#define FORMAT_UNIT_CDB_FORMAT_DATA_MASK		0x10
+/* format unit paramter list offsets */
 #define FORMAT_UNIT_SHORT_PARM_LIST_LEN			4
 #define FORMAT_UNIT_LONG_PARM_LIST_LEN			8
 #define FORMAT_UNIT_PROT_INT_OFFSET			3
 #define FORMAT_UNIT_PROT_FIELD_USAGE_OFFSET		0
 #define FORMAT_UNIT_PROT_FIELD_USAGE_MASK		0x07
-#define UNMAP_CDB_PARAM_LIST_LENGTH_OFFSET		7
 
 /* Misc. defines */
-#define NIBBLE_SHIFT					4
 #define FIXED_SENSE_DATA				0x70
 #define DESC_FORMAT_SENSE_DATA				0x72
 #define FIXED_SENSE_DATA_ADD_LENGTH			10
@@ -144,27 +107,6 @@ static int sg_version_num = 30534;	/* 2 digits for each component */
 #define EXTENDED_INQUIRY_DATA_PAGE_LENGTH		0x3C
 #define RESERVED_FIELD					0
 
-/* SCSI READ/WRITE Defines */
-#define IO_CDB_WP_MASK					0xE0
-#define IO_CDB_WP_SHIFT					5
-#define IO_CDB_FUA_MASK					0x8
-#define IO_6_CDB_LBA_OFFSET				0
-#define IO_6_CDB_LBA_MASK				0x001FFFFF
-#define IO_6_CDB_TX_LEN_OFFSET				4
-#define IO_6_DEFAULT_TX_LEN				256
-#define IO_10_CDB_LBA_OFFSET				2
-#define IO_10_CDB_TX_LEN_OFFSET				7
-#define IO_10_CDB_WP_OFFSET				1
-#define IO_10_CDB_FUA_OFFSET				1
-#define IO_12_CDB_LBA_OFFSET				2
-#define IO_12_CDB_TX_LEN_OFFSET				6
-#define IO_12_CDB_WP_OFFSET				1
-#define IO_12_CDB_FUA_OFFSET				1
-#define IO_16_CDB_FUA_OFFSET				1
-#define IO_16_CDB_WP_OFFSET				1
-#define IO_16_CDB_LBA_OFFSET				2
-#define IO_16_CDB_TX_LEN_OFFSET				10
-
 /* Mode Sense/Select defines */
 #define MODE_PAGE_INFO_EXCEP				0x1C
 #define MODE_PAGE_CACHING				0x08
@@ -179,23 +121,14 @@ static int sg_version_num = 30534;	/* 2 digits for each component */
 #define MODE_PAGE_INF_EXC_LEN				0x0C
 #define MODE_PAGE_ALL_LEN				0x54
 #define MODE_SENSE6_MPH_SIZE				4
-#define MODE_SENSE6_ALLOC_LEN_OFFSET			4
-#define MODE_SENSE_PAGE_CONTROL_OFFSET			2
 #define MODE_SENSE_PAGE_CONTROL_MASK			0xC0
 #define MODE_SENSE_PAGE_CODE_OFFSET			2
 #define MODE_SENSE_PAGE_CODE_MASK			0x3F
-#define MODE_SENSE_LLBAA_OFFSET				1
 #define MODE_SENSE_LLBAA_MASK				0x10
 #define MODE_SENSE_LLBAA_SHIFT				4
-#define MODE_SENSE_DBD_OFFSET				1
 #define MODE_SENSE_DBD_MASK				8
 #define MODE_SENSE_DBD_SHIFT				3
 #define MODE_SENSE10_MPH_SIZE				8
-#define MODE_SENSE10_ALLOC_LEN_OFFSET			7
-#define MODE_SELECT_CDB_PAGE_FORMAT_OFFSET		1
-#define MODE_SELECT_CDB_SAVE_PAGES_OFFSET		1
-#define MODE_SELECT_6_CDB_PARAM_LIST_LENGTH_OFFSET	4
-#define MODE_SELECT_10_CDB_PARAM_LIST_LENGTH_OFFSET	7
 #define MODE_SELECT_CDB_PAGE_FORMAT_MASK		0x10
 #define MODE_SELECT_CDB_SAVE_PAGES_MASK			0x1
 #define MODE_SELECT_6_BD_OFFSET				3
@@ -221,14 +154,11 @@ static int sg_version_num = 30534;	/* 2 digits for each component */
 #define LOG_PAGE_SUPPORTED_LOG_PAGES_LENGTH		0x07
 #define LOG_PAGE_INFORMATIONAL_EXCEPTIONS_PAGE		0x2F
 #define LOG_PAGE_TEMPERATURE_PAGE			0x0D
-#define LOG_SENSE_CDB_SP_OFFSET				1
 #define LOG_SENSE_CDB_SP_NOT_ENABLED			0
-#define LOG_SENSE_CDB_PC_OFFSET				2
 #define LOG_SENSE_CDB_PC_MASK				0xC0
 #define LOG_SENSE_CDB_PC_SHIFT				6
 #define LOG_SENSE_CDB_PC_CUMULATIVE_VALUES		1
 #define LOG_SENSE_CDB_PAGE_CODE_MASK			0x3F
-#define LOG_SENSE_CDB_ALLOC_LENGTH_OFFSET		7
 #define REMAINING_INFO_EXCP_PAGE_LENGTH			0x8
 #define LOG_INFO_EXCP_PAGE_LENGTH			0xC
 #define REMAINING_TEMP_PAGE_LENGTH			0xC
@@ -278,77 +208,11 @@ static int sg_version_num = 30534;	/* 2 digits for each component */
 #define SCSI_ASCQ_POWER_LOSS_EXPECTED			0x08
 #define SCSI_ASCQ_INVALID_LUN_ID			0x09
 
-/**
- * DEVICE_SPECIFIC_PARAMETER in mode parameter header (see sbc2r16) to
- * enable DPOFUA support type 0x10 value.
- */
-#define DEVICE_SPECIFIC_PARAMETER			0
-#define VPD_ID_DESCRIPTOR_LENGTH sizeof(VPD_IDENTIFICATION_DESCRIPTOR)
-
-/* MACROs to extract information from CDBs */
-
-#define GET_OPCODE(cdb)		cdb[0]
-
-#define GET_U8_FROM_CDB(cdb, index) (cdb[index] << 0)
-
-#define GET_U16_FROM_CDB(cdb, index) ((cdb[index] << 8) | (cdb[index + 1] << 0))
-
-#define GET_U24_FROM_CDB(cdb, index) ((cdb[index] << 16) | \
-(cdb[index + 1] <<  8) | \
-(cdb[index + 2] <<  0))
-
-#define GET_U32_FROM_CDB(cdb, index) ((cdb[index] << 24) | \
-(cdb[index + 1] << 16) | \
-(cdb[index + 2] <<  8) | \
-(cdb[index + 3] <<  0))
-
-#define GET_U64_FROM_CDB(cdb, index) ((((u64)cdb[index]) << 56) | \
-(((u64)cdb[index + 1]) << 48) | \
-(((u64)cdb[index + 2]) << 40) | \
-(((u64)cdb[index + 3]) << 32) | \
-(((u64)cdb[index + 4]) << 24) | \
-(((u64)cdb[index + 5]) << 16) | \
-(((u64)cdb[index + 6]) <<  8) | \
-(((u64)cdb[index + 7]) <<  0))
-
-/* Inquiry Helper Macros */
-#define GET_INQ_EVPD_BIT(cdb) \
-((GET_U8_FROM_CDB(cdb, INQUIRY_EVPD_BYTE_OFFSET) &		\
-INQUIRY_EVPD_BIT_MASK) ? 1 : 0)
-
-#define GET_INQ_PAGE_CODE(cdb)					\
-(GET_U8_FROM_CDB(cdb, INQUIRY_PAGE_CODE_BYTE_OFFSET))
-
-#define GET_INQ_ALLOC_LENGTH(cdb)				\
-(GET_U16_FROM_CDB(cdb, INQUIRY_CDB_ALLOCATION_LENGTH_OFFSET))
-
-/* Report LUNs Helper Macros */
-#define GET_REPORT_LUNS_ALLOC_LENGTH(cdb)			\
-(GET_U32_FROM_CDB(cdb, REPORT_LUNS_CDB_ALLOC_LENGTH_OFFSET))
-
-/* Read Capacity Helper Macros */
-#define GET_READ_CAP_16_ALLOC_LENGTH(cdb)			\
-(GET_U32_FROM_CDB(cdb, READ_CAP_16_CDB_ALLOC_LENGTH_OFFSET))
-
-#define IS_READ_CAP_16(cdb)					\
-((cdb[0] == SERVICE_ACTION_IN_16 && cdb[1] == SAI_READ_CAPACITY_16) ? 1 : 0)
-
-/* Request Sense Helper Macros */
-#define GET_REQUEST_SENSE_ALLOC_LENGTH(cdb)			\
-(GET_U8_FROM_CDB(cdb, REQUEST_SENSE_CDB_ALLOC_LENGTH_OFFSET))
-
-/* Mode Sense Helper Macros */
-#define GET_MODE_SENSE_DBD(cdb)					\
-((GET_U8_FROM_CDB(cdb, MODE_SENSE_DBD_OFFSET) & MODE_SENSE_DBD_MASK) >>	\
-MODE_SENSE_DBD_SHIFT)
-
-#define GET_MODE_SENSE_LLBAA(cdb)				\
-((GET_U8_FROM_CDB(cdb, MODE_SENSE_LLBAA_OFFSET) &		\
-MODE_SENSE_LLBAA_MASK) >> MODE_SENSE_LLBAA_SHIFT)
-
-#define GET_MODE_SENSE_MPH_SIZE(cdb10)				\
-(cdb10 ? MODE_SENSE10_MPH_SIZE : MODE_SENSE6_MPH_SIZE)
-
+/* copied from drivers/usb/gadget/function/storage_common.h */
+static inline u32 get_unaligned_be24(u8 *buf)
+{
+	return 0xffffff & (u32) get_unaligned_be32(buf - 1);
+}
 
 /* Struct to gather data that needs to be extracted from a SCSI CDB.
    Not conforming to any particular CDB variant, but compatible with all. */
@@ -369,8 +233,6 @@ struct nvme_trans_io_cdb {
 static int nvme_trans_copy_to_user(struct sg_io_hdr *hdr, void *from,
 								unsigned long n)
 {
-	int res = SNTI_TRANSLATION_SUCCESS;
-	unsigned long not_copied;
 	int i;
 	void *index = from;
 	size_t remaining = n;
@@ -380,29 +242,25 @@ static int nvme_trans_copy_to_user(struct sg_io_hdr *hdr, void *from,
 		struct sg_iovec sgl;
 
 		for (i = 0; i < hdr->iovec_count; i++) {
-			not_copied = copy_from_user(&sgl, hdr->dxferp +
+			if (copy_from_user(&sgl, hdr->dxferp +
 						i * sizeof(struct sg_iovec),
-						sizeof(struct sg_iovec));
-			if (not_copied)
+						sizeof(struct sg_iovec)))
 				return -EFAULT;
 			xfer_len = min(remaining, sgl.iov_len);
-			not_copied = copy_to_user(sgl.iov_base, index,
-								xfer_len);
-			if (not_copied) {
-				res = -EFAULT;
-				break;
-			}
+			if (copy_to_user(sgl.iov_base, index, xfer_len))
+				return -EFAULT;
+
 			index += xfer_len;
 			remaining -= xfer_len;
 			if (remaining == 0)
 				break;
 		}
-		return res;
+		return 0;
 	}
-	not_copied = copy_to_user(hdr->dxferp, from, n);
-	if (not_copied)
-		res = -EFAULT;
-	return res;
+
+	if (copy_to_user(hdr->dxferp, from, n))
+		return -EFAULT;
+	return 0;
 }
 
 /* Copy data from userspace memory */
@@ -410,8 +268,6 @@ static int nvme_trans_copy_to_user(struct sg_io_hdr *hdr, void *from,
 static int nvme_trans_copy_from_user(struct sg_io_hdr *hdr, void *to,
 								unsigned long n)
 {
-	int res = SNTI_TRANSLATION_SUCCESS;
-	unsigned long not_copied;
 	int i;
 	void *index = to;
 	size_t remaining = n;
@@ -421,30 +277,24 @@ static int nvme_trans_copy_from_user(struct sg_io_hdr *hdr, void *to,
 		struct sg_iovec sgl;
 
 		for (i = 0; i < hdr->iovec_count; i++) {
-			not_copied = copy_from_user(&sgl, hdr->dxferp +
+			if (copy_from_user(&sgl, hdr->dxferp +
 						i * sizeof(struct sg_iovec),
-						sizeof(struct sg_iovec));
-			if (not_copied)
+						sizeof(struct sg_iovec)))
 				return -EFAULT;
 			xfer_len = min(remaining, sgl.iov_len);
-			not_copied = copy_from_user(index, sgl.iov_base,
-								xfer_len);
-			if (not_copied) {
-				res = -EFAULT;
-				break;
-			}
+			if (copy_from_user(index, sgl.iov_base, xfer_len))
+				return -EFAULT;
 			index += xfer_len;
 			remaining -= xfer_len;
 			if (remaining == 0)
 				break;
 		}
-		return res;
+		return 0;
 	}
 
-	not_copied = copy_from_user(to, hdr->dxferp, n);
-	if (not_copied)
-		res = -EFAULT;
-	return res;
+	if (copy_from_user(to, hdr->dxferp, n))
+		return -EFAULT;
+	return 0;
 }
 
 /* Status/Sense Buffer Writeback */
@@ -452,7 +302,6 @@ static int nvme_trans_copy_from_user(struct sg_io_hdr *hdr, void *to,
 static int nvme_trans_completion(struct sg_io_hdr *hdr, u8 status, u8 sense_key,
 				 u8 asc, u8 ascq)
 {
-	int res = SNTI_TRANSLATION_SUCCESS;
 	u8 xfer_len;
 	u8 resp[DESC_FMT_SENSE_DATA_SIZE];
 
@@ -477,25 +326,29 @@ static int nvme_trans_completion(struct sg_io_hdr *hdr, u8 status, u8 sense_key,
 		xfer_len = min_t(u8, hdr->mx_sb_len, DESC_FMT_SENSE_DATA_SIZE);
 		hdr->sb_len_wr = xfer_len;
 		if (copy_to_user(hdr->sbp, resp, xfer_len) > 0)
-			res = -EFAULT;
+			return -EFAULT;
 	}
 
-	return res;
+	return 0;
 }
 
+/*
+ * Take a status code from a lowlevel routine, and if it was a positive NVMe
+ * error code update the sense data based on it.  In either case the passed
+ * in value is returned again, unless an -EFAULT from copy_to_user overrides
+ * it.
+ */
 static int nvme_trans_status_code(struct sg_io_hdr *hdr, int nvme_sc)
 {
 	u8 status, sense_key, asc, ascq;
-	int res = SNTI_TRANSLATION_SUCCESS;
+	int res;
 
 	/* For non-nvme (Linux) errors, simply return the error code */
 	if (nvme_sc < 0)
 		return nvme_sc;
 
 	/* Mask DNR, More, and reserved fields */
-	nvme_sc &= 0x7FF;
-
-	switch (nvme_sc) {
+	switch (nvme_sc & 0x7FF) {
 	/* Generic Command Status */
 	case NVME_SC_SUCCESS:
 		status = SAM_STAT_GOOD;
@@ -662,8 +515,7 @@ static int nvme_trans_status_code(struct sg_io_hdr *hdr, int nvme_sc)
 	}
 
 	res = nvme_trans_completion(hdr, status, sense_key, asc, ascq);
-
-	return res;
+	return res ? res : nvme_sc;
 }
 
 /* INQUIRY Helper Functions */
@@ -673,10 +525,8 @@ static int nvme_trans_standard_inquiry_page(struct nvme_ns *ns,
 					int alloc_len)
 {
 	struct nvme_dev *dev = ns->dev;
-	dma_addr_t dma_addr;
-	void *mem;
 	struct nvme_id_ns *id_ns;
-	int res = SNTI_TRANSLATION_SUCCESS;
+	int res;
 	int nvme_sc;
 	int xfer_len;
 	u8 resp_data_format = 0x02;
@@ -684,31 +534,17 @@ static int nvme_trans_standard_inquiry_page(struct nvme_ns *ns,
 	u8 cmdque = 0x01 << 1;
 	u8 fw_offset = sizeof(dev->firmware_rev);
 
-	mem = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns),
-				&dma_addr, GFP_KERNEL);
-	if (mem == NULL) {
-		res = -ENOMEM;
-		goto out_dma;
-	}
-
 	/* nvme ns identify - use DPS value for PROTECT field */
-	nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr);
+	nvme_sc = nvme_identify_ns(dev, ns->ns_id, &id_ns);
 	res = nvme_trans_status_code(hdr, nvme_sc);
-	/*
-	 * If nvme_sc was -ve, res will be -ve here.
-	 * If nvme_sc was +ve, the status would bace been translated, and res
-	 *  can only be 0 or -ve.
-	 *    - If 0 && nvme_sc > 0, then go into next if where res gets nvme_sc
-	 *    - If -ve, return because its a Linux error.
-	 */
 	if (res)
-		goto out_free;
-	if (nvme_sc) {
-		res = nvme_sc;
-		goto out_free;
-	}
-	id_ns = mem;
-	(id_ns->dps) ? (protect = 0x01) : (protect = 0);
+		return res;
+
+	if (id_ns->dps)
+		protect = 0x01;
+	else
+		protect = 0;
+	kfree(id_ns);
 
 	memset(inq_response, 0, STANDARD_INQUIRY_LENGTH);
 	inq_response[2] = VERSION_SPC_4;
@@ -725,20 +561,13 @@ static int nvme_trans_standard_inquiry_page(struct nvme_ns *ns,
 	strncpy(&inq_response[32], dev->firmware_rev + fw_offset, 4);
 
 	xfer_len = min(alloc_len, STANDARD_INQUIRY_LENGTH);
-	res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len);
-
- out_free:
-	dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), mem,
-			  dma_addr);
- out_dma:
-	return res;
+	return nvme_trans_copy_to_user(hdr, inq_response, xfer_len);
 }
 
 static int nvme_trans_supported_vpd_pages(struct nvme_ns *ns,
 					struct sg_io_hdr *hdr, u8 *inq_response,
 					int alloc_len)
 {
-	int res = SNTI_TRANSLATION_SUCCESS;
 	int xfer_len;
 
 	memset(inq_response, 0, STANDARD_INQUIRY_LENGTH);
@@ -752,9 +581,7 @@ static int nvme_trans_supported_vpd_pages(struct nvme_ns *ns,
 	inq_response[9] = INQ_BDEV_LIMITS_PAGE;
 
 	xfer_len = min(alloc_len, STANDARD_INQUIRY_LENGTH);
-	res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len);
-
-	return res;
+	return nvme_trans_copy_to_user(hdr, inq_response, xfer_len);
 }
 
 static int nvme_trans_unit_serial_page(struct nvme_ns *ns,
@@ -762,7 +589,6 @@ static int nvme_trans_unit_serial_page(struct nvme_ns *ns,
 					int alloc_len)
 {
 	struct nvme_dev *dev = ns->dev;
-	int res = SNTI_TRANSLATION_SUCCESS;
 	int xfer_len;
 
 	memset(inq_response, 0, STANDARD_INQUIRY_LENGTH);
@@ -771,53 +597,42 @@ static int nvme_trans_unit_serial_page(struct nvme_ns *ns,
 	strncpy(&inq_response[4], dev->serial, INQ_SERIAL_NUMBER_LENGTH);
 
 	xfer_len = min(alloc_len, STANDARD_INQUIRY_LENGTH);
-	res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len);
-
-	return res;
+	return nvme_trans_copy_to_user(hdr, inq_response, xfer_len);
 }
 
 static int nvme_trans_device_id_page(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 					u8 *inq_response, int alloc_len)
 {
 	struct nvme_dev *dev = ns->dev;
-	dma_addr_t dma_addr;
-	void *mem;
-	int res = SNTI_TRANSLATION_SUCCESS;
+	int res;
 	int nvme_sc;
 	int xfer_len;
 	__be32 tmp_id = cpu_to_be32(ns->ns_id);
 
-	mem = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns),
-					&dma_addr, GFP_KERNEL);
-	if (mem == NULL) {
-		res = -ENOMEM;
-		goto out_dma;
-	}
-
 	memset(inq_response, 0, alloc_len);
 	inq_response[1] = INQ_DEVICE_IDENTIFICATION_PAGE;    /* Page Code */
 	if (readl(&dev->bar->vs) >= NVME_VS(1, 1)) {
-		struct nvme_id_ns *id_ns = mem;
-		void *eui = id_ns->eui64;
-		int len = sizeof(id_ns->eui64);
+		struct nvme_id_ns *id_ns;
+		void *eui;
+		int len;
 
-		nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr);
+		nvme_sc = nvme_identify_ns(dev, ns->ns_id, &id_ns);
 		res = nvme_trans_status_code(hdr, nvme_sc);
 		if (res)
-			goto out_free;
-		if (nvme_sc) {
-			res = nvme_sc;
-			goto out_free;
-		}
+			return res;
 
+		eui = id_ns->eui64;
+		len = sizeof(id_ns->eui64);
 		if (readl(&dev->bar->vs) >= NVME_VS(1, 2)) {
 			if (bitmap_empty(eui, len * 8)) {
 				eui = id_ns->nguid;
 				len = sizeof(id_ns->nguid);
 			}
 		}
-		if (bitmap_empty(eui, len * 8))
+		if (bitmap_empty(eui, len * 8)) {
+			kfree(id_ns);
 			goto scsi_string;
+		}
 
 		inq_response[3] = 4 + len; /* Page Length */
 		/* Designation Descriptor start */
@@ -826,14 +641,14 @@ static int nvme_trans_device_id_page(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 		inq_response[6] = 0x00;    /* Rsvd */
 		inq_response[7] = len;     /* Designator Length */
 		memcpy(&inq_response[8], eui, len);
+		kfree(id_ns);
 	} else {
  scsi_string:
 		if (alloc_len < 72) {
-			res = nvme_trans_completion(hdr,
+			return nvme_trans_completion(hdr,
 					SAM_STAT_CHECK_CONDITION,
 					ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB,
 					SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
-			goto out_free;
 		}
 		inq_response[3] = 0x48;    /* Page Length */
 		/* Designation Descriptor start */
@@ -842,30 +657,22 @@ static int nvme_trans_device_id_page(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 		inq_response[6] = 0x00;    /* Rsvd */
 		inq_response[7] = 0x44;    /* Designator Length */
 
-		sprintf(&inq_response[8], "%04x", dev->pci_dev->vendor);
+		sprintf(&inq_response[8], "%04x", to_pci_dev(dev->dev)->vendor);
 		memcpy(&inq_response[12], dev->model, sizeof(dev->model));
 		sprintf(&inq_response[52], "%04x", tmp_id);
 		memcpy(&inq_response[56], dev->serial, sizeof(dev->serial));
 	}
 	xfer_len = alloc_len;
-	res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len);
-
- out_free:
-	dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), mem,
-			  dma_addr);
- out_dma:
-	return res;
+	return nvme_trans_copy_to_user(hdr, inq_response, xfer_len);
 }
 
 static int nvme_trans_ext_inq_page(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 					int alloc_len)
 {
 	u8 *inq_response;
-	int res = SNTI_TRANSLATION_SUCCESS;
+	int res;
 	int nvme_sc;
 	struct nvme_dev *dev = ns->dev;
-	dma_addr_t dma_addr;
-	void *mem;
 	struct nvme_id_ctrl *id_ctrl;
 	struct nvme_id_ns *id_ns;
 	int xfer_len;
@@ -878,45 +685,32 @@ static int nvme_trans_ext_inq_page(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 	u8 luiclr = 0x01;
 
 	inq_response = kmalloc(EXTENDED_INQUIRY_DATA_PAGE_LENGTH, GFP_KERNEL);
-	if (inq_response == NULL) {
-		res = -ENOMEM;
-		goto out_mem;
-	}
-
-	mem = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns),
-							&dma_addr, GFP_KERNEL);
-	if (mem == NULL) {
-		res = -ENOMEM;
-		goto out_dma;
-	}
+	if (inq_response == NULL)
+		return -ENOMEM;
 
-	/* nvme ns identify */
-	nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr);
+	nvme_sc = nvme_identify_ns(dev, ns->ns_id, &id_ns);
 	res = nvme_trans_status_code(hdr, nvme_sc);
 	if (res)
-		goto out_free;
-	if (nvme_sc) {
-		res = nvme_sc;
-		goto out_free;
-	}
-	id_ns = mem;
-	spt = spt_lut[(id_ns->dpc) & 0x07] << 3;
-	(id_ns->dps) ? (protect = 0x01) : (protect = 0);
+		goto out_free_inq;
+
+	spt = spt_lut[id_ns->dpc & 0x07] << 3;
+	if (id_ns->dps)
+		protect = 0x01;
+	else
+		protect = 0;
+	kfree(id_ns);
+
 	grd_chk = protect << 2;
 	app_chk = protect << 1;
 	ref_chk = protect;
 
-	/* nvme controller identify */
-	nvme_sc = nvme_identify(dev, 0, 1, dma_addr);
+	nvme_sc = nvme_identify_ctrl(dev, &id_ctrl);
 	res = nvme_trans_status_code(hdr, nvme_sc);
 	if (res)
-		goto out_free;
-	if (nvme_sc) {
-		res = nvme_sc;
-		goto out_free;
-	}
-	id_ctrl = mem;
+		goto out_free_inq;
+
 	v_sup = id_ctrl->vwc;
+	kfree(id_ctrl);
 
 	memset(inq_response, 0, EXTENDED_INQUIRY_DATA_PAGE_LENGTH);
 	inq_response[1] = INQ_EXTENDED_INQUIRY_DATA_PAGE;    /* Page Code */
@@ -932,12 +726,8 @@ static int nvme_trans_ext_inq_page(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 	xfer_len = min(alloc_len, EXTENDED_INQUIRY_DATA_PAGE_LENGTH);
 	res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len);
 
- out_free:
-	dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), mem,
-			  dma_addr);
- out_dma:
+ out_free_inq:
 	kfree(inq_response);
- out_mem:
 	return res;
 }
 
@@ -965,7 +755,7 @@ static int nvme_trans_bdev_char_page(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 					int alloc_len)
 {
 	u8 *inq_response;
-	int res = SNTI_TRANSLATION_SUCCESS;
+	int res;
 	int xfer_len;
 
 	inq_response = kzalloc(EXTENDED_INQUIRY_DATA_PAGE_LENGTH, GFP_KERNEL);
@@ -994,7 +784,7 @@ static int nvme_trans_bdev_char_page(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 static int nvme_trans_log_supp_pages(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 					int alloc_len)
 {
-	int res = SNTI_TRANSLATION_SUCCESS;
+	int res;
 	int xfer_len;
 	u8 *log_response;
 
@@ -1022,47 +812,30 @@ static int nvme_trans_log_supp_pages(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 static int nvme_trans_log_info_exceptions(struct nvme_ns *ns,
 					struct sg_io_hdr *hdr, int alloc_len)
 {
-	int res = SNTI_TRANSLATION_SUCCESS;
+	int res;
 	int xfer_len;
 	u8 *log_response;
-	struct nvme_command c;
 	struct nvme_dev *dev = ns->dev;
 	struct nvme_smart_log *smart_log;
-	dma_addr_t dma_addr;
-	void *mem;
 	u8 temp_c;
 	u16 temp_k;
 
 	log_response = kzalloc(LOG_INFO_EXCP_PAGE_LENGTH, GFP_KERNEL);
-	if (log_response == NULL) {
-		res = -ENOMEM;
-		goto out_mem;
-	}
+	if (log_response == NULL)
+		return -ENOMEM;
 
-	mem = dma_alloc_coherent(&dev->pci_dev->dev,
-					sizeof(struct nvme_smart_log),
-					&dma_addr, GFP_KERNEL);
-	if (mem == NULL) {
-		res = -ENOMEM;
-		goto out_dma;
-	}
+	res = nvme_get_log_page(dev, &smart_log);
+	if (res < 0)
+		goto out_free_response;
 
-	/* Get SMART Log Page */
-	memset(&c, 0, sizeof(c));
-	c.common.opcode = nvme_admin_get_log_page;
-	c.common.nsid = cpu_to_le32(0xFFFFFFFF);
-	c.common.prp1 = cpu_to_le64(dma_addr);
-	c.common.cdw10[0] = cpu_to_le32((((sizeof(struct nvme_smart_log) /
-			BYTES_TO_DWORDS) - 1) << 16) | NVME_LOG_SMART);
-	res = nvme_submit_admin_cmd(dev, &c, NULL);
 	if (res != NVME_SC_SUCCESS) {
 		temp_c = LOG_TEMP_UNKNOWN;
 	} else {
-		smart_log = mem;
 		temp_k = (smart_log->temperature[1] << 8) +
 				(smart_log->temperature[0]);
 		temp_c = temp_k - KELVIN_TEMP_FACTOR;
 	}
+	kfree(smart_log);
 
 	log_response[0] = LOG_PAGE_INFORMATIONAL_EXCEPTIONS_PAGE;
 	/* Subpage=0x00, Page Length MSB=0 */
@@ -1078,59 +851,39 @@ static int nvme_trans_log_info_exceptions(struct nvme_ns *ns,
 	xfer_len = min(alloc_len, LOG_INFO_EXCP_PAGE_LENGTH);
 	res = nvme_trans_copy_to_user(hdr, log_response, xfer_len);
 
-	dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_smart_log),
-			  mem, dma_addr);
- out_dma:
+ out_free_response:
 	kfree(log_response);
- out_mem:
 	return res;
 }
 
 static int nvme_trans_log_temperature(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 					int alloc_len)
 {
-	int res = SNTI_TRANSLATION_SUCCESS;
+	int res;
 	int xfer_len;
 	u8 *log_response;
-	struct nvme_command c;
 	struct nvme_dev *dev = ns->dev;
 	struct nvme_smart_log *smart_log;
-	dma_addr_t dma_addr;
-	void *mem;
 	u32 feature_resp;
 	u8 temp_c_cur, temp_c_thresh;
 	u16 temp_k;
 
 	log_response = kzalloc(LOG_TEMP_PAGE_LENGTH, GFP_KERNEL);
-	if (log_response == NULL) {
-		res = -ENOMEM;
-		goto out_mem;
-	}
+	if (log_response == NULL)
+		return -ENOMEM;
 
-	mem = dma_alloc_coherent(&dev->pci_dev->dev,
-					sizeof(struct nvme_smart_log),
-					&dma_addr, GFP_KERNEL);
-	if (mem == NULL) {
-		res = -ENOMEM;
-		goto out_dma;
-	}
+	res = nvme_get_log_page(dev, &smart_log);
+	if (res < 0)
+		goto out_free_response;
 
-	/* Get SMART Log Page */
-	memset(&c, 0, sizeof(c));
-	c.common.opcode = nvme_admin_get_log_page;
-	c.common.nsid = cpu_to_le32(0xFFFFFFFF);
-	c.common.prp1 = cpu_to_le64(dma_addr);
-	c.common.cdw10[0] = cpu_to_le32((((sizeof(struct nvme_smart_log) /
-			BYTES_TO_DWORDS) - 1) << 16) | NVME_LOG_SMART);
-	res = nvme_submit_admin_cmd(dev, &c, NULL);
 	if (res != NVME_SC_SUCCESS) {
 		temp_c_cur = LOG_TEMP_UNKNOWN;
 	} else {
-		smart_log = mem;
 		temp_k = (smart_log->temperature[1] << 8) +
 				(smart_log->temperature[0]);
 		temp_c_cur = temp_k - KELVIN_TEMP_FACTOR;
 	}
+	kfree(smart_log);
 
 	/* Get Features for Temp Threshold */
 	res = nvme_get_features(dev, NVME_FEAT_TEMP_THRESH, 0, 0,
@@ -1159,11 +912,8 @@ static int nvme_trans_log_temperature(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 	xfer_len = min(alloc_len, LOG_TEMP_PAGE_LENGTH);
 	res = nvme_trans_copy_to_user(hdr, log_response, xfer_len);
 
-	dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_smart_log),
-			  mem, dma_addr);
- out_dma:
+ out_free_response:
 	kfree(log_response);
- out_mem:
 	return res;
 }
 
@@ -1174,59 +924,45 @@ static int nvme_trans_fill_mode_parm_hdr(u8 *resp, int len, u8 cdb10, u8 llbaa,
 {
 	/* Quick check to make sure I don't stomp on my own memory... */
 	if ((cdb10 && len < 8) || (!cdb10 && len < 4))
-		return SNTI_INTERNAL_ERROR;
+		return -EINVAL;
 
 	if (cdb10) {
 		resp[0] = (mode_data_length & 0xFF00) >> 8;
 		resp[1] = (mode_data_length & 0x00FF);
-		/* resp[2] and [3] are zero */
+		resp[3] = 0x10 /* DPOFUA */;
 		resp[4] = llbaa;
 		resp[5] = RESERVED_FIELD;
 		resp[6] = (blk_desc_len & 0xFF00) >> 8;
 		resp[7] = (blk_desc_len & 0x00FF);
 	} else {
 		resp[0] = (mode_data_length & 0x00FF);
-		/* resp[1] and [2] are zero */
+		resp[2] = 0x10 /* DPOFUA */;
 		resp[3] = (blk_desc_len & 0x00FF);
 	}
 
-	return SNTI_TRANSLATION_SUCCESS;
+	return 0;
 }
 
 static int nvme_trans_fill_blk_desc(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 				    u8 *resp, int len, u8 llbaa)
 {
-	int res = SNTI_TRANSLATION_SUCCESS;
+	int res;
 	int nvme_sc;
 	struct nvme_dev *dev = ns->dev;
-	dma_addr_t dma_addr;
-	void *mem;
 	struct nvme_id_ns *id_ns;
 	u8 flbas;
 	u32 lba_length;
 
 	if (llbaa == 0 && len < MODE_PAGE_BLK_DES_LEN)
-		return SNTI_INTERNAL_ERROR;
+		return -EINVAL;
 	else if (llbaa > 0 && len < MODE_PAGE_LLBAA_BLK_DES_LEN)
-		return SNTI_INTERNAL_ERROR;
-
-	mem = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns),
-							&dma_addr, GFP_KERNEL);
-	if (mem == NULL) {
-		res = -ENOMEM;
-		goto out;
-	}
+		return -EINVAL;
 
-	/* nvme ns identify */
-	nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr);
+	nvme_sc = nvme_identify_ns(dev, ns->ns_id, &id_ns);
 	res = nvme_trans_status_code(hdr, nvme_sc);
 	if (res)
-		goto out_dma;
-	if (nvme_sc) {
-		res = nvme_sc;
-		goto out_dma;
-	}
-	id_ns = mem;
+		return res;
+
 	flbas = (id_ns->flbas) & 0x0F;
 	lba_length = (1 << (id_ns->lbaf[flbas].ds));
 
@@ -1246,10 +982,7 @@ static int nvme_trans_fill_blk_desc(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 		memcpy(&resp[12], &tmp_len, sizeof(u32));
 	}
 
- out_dma:
-	dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), mem,
-			  dma_addr);
- out:
+	kfree(id_ns);
 	return res;
 }
 
@@ -1258,7 +991,7 @@ static int nvme_trans_fill_control_page(struct nvme_ns *ns,
 					int len)
 {
 	if (len < MODE_PAGE_CONTROL_LEN)
-		return SNTI_INTERNAL_ERROR;
+		return -EINVAL;
 
 	resp[0] = MODE_PAGE_CONTROL;
 	resp[1] = MODE_PAGE_CONTROL_LEN_FIELD;
@@ -1272,78 +1005,69 @@ static int nvme_trans_fill_control_page(struct nvme_ns *ns,
 	resp[9] = 0xFF;
 	/* Bytes 10,11: Extended selftest completion time = 0x0000 */
 
-	return SNTI_TRANSLATION_SUCCESS;
+	return 0;
 }
 
 static int nvme_trans_fill_caching_page(struct nvme_ns *ns,
 					struct sg_io_hdr *hdr,
 					u8 *resp, int len)
 {
-	int res = SNTI_TRANSLATION_SUCCESS;
+	int res = 0;
 	int nvme_sc;
 	struct nvme_dev *dev = ns->dev;
 	u32 feature_resp;
 	u8 vwc;
 
 	if (len < MODE_PAGE_CACHING_LEN)
-		return SNTI_INTERNAL_ERROR;
+		return -EINVAL;
 
 	nvme_sc = nvme_get_features(dev, NVME_FEAT_VOLATILE_WC, 0, 0,
 								&feature_resp);
 	res = nvme_trans_status_code(hdr, nvme_sc);
 	if (res)
-		goto out;
-	if (nvme_sc) {
-		res = nvme_sc;
-		goto out;
-	}
+		return res;
+
 	vwc = feature_resp & 0x00000001;
 
 	resp[0] = MODE_PAGE_CACHING;
 	resp[1] = MODE_PAGE_CACHING_LEN_FIELD;
 	resp[2] = vwc << 2;
-
- out:
-	return res;
+	return 0;
 }
 
 static int nvme_trans_fill_pow_cnd_page(struct nvme_ns *ns,
 					struct sg_io_hdr *hdr, u8 *resp,
 					int len)
 {
-	int res = SNTI_TRANSLATION_SUCCESS;
-
 	if (len < MODE_PAGE_POW_CND_LEN)
-		return SNTI_INTERNAL_ERROR;
+		return -EINVAL;
 
 	resp[0] = MODE_PAGE_POWER_CONDITION;
 	resp[1] = MODE_PAGE_POW_CND_LEN_FIELD;
 	/* All other bytes are zero */
 
-	return res;
+	return 0;
 }
 
 static int nvme_trans_fill_inf_exc_page(struct nvme_ns *ns,
 					struct sg_io_hdr *hdr, u8 *resp,
 					int len)
 {
-	int res = SNTI_TRANSLATION_SUCCESS;
-
 	if (len < MODE_PAGE_INF_EXC_LEN)
-		return SNTI_INTERNAL_ERROR;
+		return -EINVAL;
 
 	resp[0] = MODE_PAGE_INFO_EXCEP;
 	resp[1] = MODE_PAGE_INF_EXC_LEN_FIELD;
 	resp[2] = 0x88;
 	/* All other bytes are zero */
 
-	return res;
+	return 0;
 }
 
 static int nvme_trans_fill_all_pages(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 				     u8 *resp, int len)
 {
-	int res = SNTI_TRANSLATION_SUCCESS;
+	int res;
 	u16 mode_pages_offset_1 = 0;
 	u16 mode_pages_offset_2, mode_pages_offset_3, mode_pages_offset_4;
 
@@ -1353,23 +1077,18 @@ static int nvme_trans_fill_all_pages(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 
 	res = nvme_trans_fill_caching_page(ns, hdr, &resp[mode_pages_offset_1],
 					MODE_PAGE_CACHING_LEN);
-	if (res != SNTI_TRANSLATION_SUCCESS)
-		goto out;
+	if (res)
+		return res;
 	res = nvme_trans_fill_control_page(ns, hdr, &resp[mode_pages_offset_2],
 					MODE_PAGE_CONTROL_LEN);
-	if (res != SNTI_TRANSLATION_SUCCESS)
-		goto out;
+	if (res)
+		return res;
 	res = nvme_trans_fill_pow_cnd_page(ns, hdr, &resp[mode_pages_offset_3],
 					MODE_PAGE_POW_CND_LEN);
-	if (res != SNTI_TRANSLATION_SUCCESS)
-		goto out;
-	res = nvme_trans_fill_inf_exc_page(ns, hdr, &resp[mode_pages_offset_4],
+	if (res)
+		return res;
+	return nvme_trans_fill_inf_exc_page(ns, hdr, &resp[mode_pages_offset_4],
 					MODE_PAGE_INF_EXC_LEN);
-	if (res != SNTI_TRANSLATION_SUCCESS)
-		goto out;
-
- out:
-	return res;
 }
 
 static inline int nvme_trans_get_blk_desc_len(u8 dbd, u8 llbaa)
@@ -1390,7 +1109,7 @@ static int nvme_trans_mode_page_create(struct nvme_ns *ns,
 					struct sg_io_hdr *hdr, u8 *, int),
 					u16 mode_pages_tot_len)
 {
-	int res = SNTI_TRANSLATION_SUCCESS;
+	int res;
 	int xfer_len;
 	u8 *response;
 	u8 dbd, llbaa;
@@ -1399,9 +1118,10 @@ static int nvme_trans_mode_page_create(struct nvme_ns *ns,
 	u16 mode_pages_offset_1;
 	u16 blk_desc_len, blk_desc_offset, mode_data_length;
 
-	dbd = GET_MODE_SENSE_DBD(cmd);
-	llbaa = GET_MODE_SENSE_LLBAA(cmd);
-	mph_size = GET_MODE_SENSE_MPH_SIZE(cdb10);
+	dbd = (cmd[1] & MODE_SENSE_DBD_MASK) >> MODE_SENSE_DBD_SHIFT;
+	llbaa = (cmd[1] & MODE_SENSE_LLBAA_MASK) >> MODE_SENSE_LLBAA_SHIFT;
+	mph_size = cdb10 ? MODE_SENSE10_MPH_SIZE : MODE_SENSE6_MPH_SIZE;
+
 	blk_desc_len = nvme_trans_get_blk_desc_len(dbd, llbaa);
 
 	resp_size = mph_size + blk_desc_len + mode_pages_tot_len;
@@ -1419,18 +1139,18 @@ static int nvme_trans_mode_page_create(struct nvme_ns *ns,
 
 	res = nvme_trans_fill_mode_parm_hdr(&response[0], mph_size, cdb10,
 					llbaa, mode_data_length, blk_desc_len);
-	if (res != SNTI_TRANSLATION_SUCCESS)
+	if (res)
 		goto out_free;
 	if (blk_desc_len > 0) {
 		res = nvme_trans_fill_blk_desc(ns, hdr,
 					       &response[blk_desc_offset],
 					       blk_desc_len, llbaa);
-		if (res != SNTI_TRANSLATION_SUCCESS)
+		if (res)
 			goto out_free;
 	}
 	res = mode_page_fill_func(ns, hdr, &response[mode_pages_offset_1],
 					mode_pages_tot_len);
-	if (res != SNTI_TRANSLATION_SUCCESS)
+	if (res)
 		goto out_free;
 
 	xfer_len = min(alloc_len, resp_size);
@@ -1485,33 +1205,20 @@ static void nvme_trans_fill_read_cap(u8 *response, struct nvme_id_ns *id_ns,
 static int nvme_trans_power_state(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 						u8 pc, u8 pcmod, u8 start)
 {
-	int res = SNTI_TRANSLATION_SUCCESS;
+	int res;
 	int nvme_sc;
 	struct nvme_dev *dev = ns->dev;
-	dma_addr_t dma_addr;
-	void *mem;
 	struct nvme_id_ctrl *id_ctrl;
 	int lowest_pow_st;	/* max npss = lowest power consumption */
 	unsigned ps_desired = 0;
 
-	/* NVMe Controller Identify */
-	mem = dma_alloc_coherent(&dev->pci_dev->dev,
-				sizeof(struct nvme_id_ctrl),
-				&dma_addr, GFP_KERNEL);
-	if (mem == NULL) {
-		res = -ENOMEM;
-		goto out;
-	}
-	nvme_sc = nvme_identify(dev, 0, 1, dma_addr);
+	nvme_sc = nvme_identify_ctrl(dev, &id_ctrl);
 	res = nvme_trans_status_code(hdr, nvme_sc);
 	if (res)
-		goto out_dma;
-	if (nvme_sc) {
-		res = nvme_sc;
-		goto out_dma;
-	}
-	id_ctrl = mem;
+		return res;
+
 	lowest_pow_st = max(POWER_STATE_0, (int)(id_ctrl->npss - 1));
+	kfree(id_ctrl);
 
 	switch (pc) {
 	case NVME_POWER_STATE_START_VALID:
@@ -1551,79 +1258,48 @@ static int nvme_trans_power_state(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 	}
 	nvme_sc = nvme_set_features(dev, NVME_FEAT_POWER_MGMT, ps_desired, 0,
 				    NULL);
-	res = nvme_trans_status_code(hdr, nvme_sc);
-	if (res)
-		goto out_dma;
-	if (nvme_sc)
-		res = nvme_sc;
- out_dma:
-	dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ctrl), mem,
-			  dma_addr);
- out:
-	return res;
+	return nvme_trans_status_code(hdr, nvme_sc);
 }
 
-/* Write Buffer Helper Functions */
-/* Also using this for Format Unit with hdr passed as NULL, and buffer_id, 0 */
+static int nvme_trans_send_activate_fw_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr,
+					u8 buffer_id)
+{
+	struct nvme_command c;
+	int nvme_sc;
+
+	memset(&c, 0, sizeof(c));
+	c.common.opcode = nvme_admin_activate_fw;
+	c.common.cdw10[0] = cpu_to_le32(buffer_id | NVME_FWACT_REPL_ACTV);
 
-static int nvme_trans_send_fw_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr,
+	nvme_sc = nvme_submit_sync_cmd(ns->queue, &c, NULL, 0);
+	return nvme_trans_status_code(hdr, nvme_sc);
+}
+
+static int nvme_trans_send_download_fw_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 					u8 opcode, u32 tot_len, u32 offset,
 					u8 buffer_id)
 {
-	int res = SNTI_TRANSLATION_SUCCESS;
 	int nvme_sc;
 	struct nvme_dev *dev = ns->dev;
 	struct nvme_command c;
-	struct nvme_iod *iod = NULL;
-	unsigned length;
 
-	memset(&c, 0, sizeof(c));
-	c.common.opcode = opcode;
-	if (opcode == nvme_admin_download_fw) {
-		if (hdr->iovec_count > 0) {
-			/* Assuming SGL is not allowed for this command */
-			res = nvme_trans_completion(hdr,
-						SAM_STAT_CHECK_CONDITION,
-						ILLEGAL_REQUEST,
-						SCSI_ASC_INVALID_CDB,
-						SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
-			goto out;
-		}
-		iod = nvme_map_user_pages(dev, DMA_TO_DEVICE,
-				(unsigned long)hdr->dxferp, tot_len);
-		if (IS_ERR(iod)) {
-			res = PTR_ERR(iod);
-			goto out;
-		}
-		length = nvme_setup_prps(dev, iod, tot_len, GFP_KERNEL);
-		if (length != tot_len) {
-			res = -ENOMEM;
-			goto out_unmap;
-		}
-
-		c.dlfw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
-		c.dlfw.prp2 = cpu_to_le64(iod->first_dma);
-		c.dlfw.numd = cpu_to_le32((tot_len/BYTES_TO_DWORDS) - 1);
-		c.dlfw.offset = cpu_to_le32(offset/BYTES_TO_DWORDS);
-	} else if (opcode == nvme_admin_activate_fw) {
-		u32 cdw10 = buffer_id | NVME_FWACT_REPL_ACTV;
-		c.common.cdw10[0] = cpu_to_le32(cdw10);
+	if (hdr->iovec_count > 0) {
+		/* Assuming SGL is not allowed for this command */
+		return nvme_trans_completion(hdr,
+					SAM_STAT_CHECK_CONDITION,
+					ILLEGAL_REQUEST,
+					SCSI_ASC_INVALID_CDB,
+					SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
 	}
 
-	nvme_sc = nvme_submit_admin_cmd(dev, &c, NULL);
-	res = nvme_trans_status_code(hdr, nvme_sc);
-	if (res)
-		goto out_unmap;
-	if (nvme_sc)
-		res = nvme_sc;
-
- out_unmap:
-	if (opcode == nvme_admin_download_fw) {
-		nvme_unmap_user_pages(dev, DMA_TO_DEVICE, iod);
-		nvme_free_iod(dev, iod);
-	}
- out:
-	return res;
+	memset(&c, 0, sizeof(c));
+	c.common.opcode = nvme_admin_download_fw;
+	c.dlfw.numd = cpu_to_le32((tot_len/BYTES_TO_DWORDS) - 1);
+	c.dlfw.offset = cpu_to_le32(offset/BYTES_TO_DWORDS);
+
+	nvme_sc = __nvme_submit_sync_cmd(dev->admin_q, &c, NULL,
+			hdr->dxferp, tot_len, NULL, 0);
+	return nvme_trans_status_code(hdr, nvme_sc);
 }
 
 /* Mode Select Helper Functions */
@@ -1686,7 +1362,7 @@ static void nvme_trans_modesel_save_bd(struct nvme_ns *ns, u8 *parm_list,
 static int nvme_trans_modesel_get_mp(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 					u8 *mode_page, u8 page_code)
 {
-	int res = SNTI_TRANSLATION_SUCCESS;
+	int res = 0;
 	int nvme_sc;
 	struct nvme_dev *dev = ns->dev;
 	unsigned dword11;
@@ -1697,12 +1373,6 @@ static int nvme_trans_modesel_get_mp(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 		nvme_sc = nvme_set_features(dev, NVME_FEAT_VOLATILE_WC, dword11,
 					    0, NULL);
 		res = nvme_trans_status_code(hdr, nvme_sc);
-		if (res)
-			break;
-		if (nvme_sc) {
-			res = nvme_sc;
-			break;
-		}
 		break;
 	case MODE_PAGE_CONTROL:
 		break;
@@ -1714,8 +1384,6 @@ static int nvme_trans_modesel_get_mp(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 						ILLEGAL_REQUEST,
 						SCSI_ASC_INVALID_PARAMETER,
 						SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
-			if (!res)
-				res = SNTI_INTERNAL_ERROR;
 			break;
 		}
 		break;
@@ -1723,8 +1391,6 @@ static int nvme_trans_modesel_get_mp(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 		res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION,
 					ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB,
 					SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
-		if (!res)
-			res = SNTI_INTERNAL_ERROR;
 		break;
 	}
 
@@ -1735,7 +1401,7 @@ static int nvme_trans_modesel_data(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 					u8 *cmd, u16 parm_list_len, u8 pf,
 					u8 sp, u8 cdb10)
 {
-	int res = SNTI_TRANSLATION_SUCCESS;
+	int res;
 	u8 *parm_list;
 	u16 bd_len;
 	u8 llbaa = 0;
@@ -1751,7 +1417,7 @@ static int nvme_trans_modesel_data(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 	}
 
 	res = nvme_trans_copy_from_user(hdr, parm_list, parm_list_len);
-	if (res != SNTI_TRANSLATION_SUCCESS)
+	if (res)
 		goto out_mem;
 
 	nvme_trans_modesel_get_bd_len(parm_list, cdb10, &bd_len, &llbaa);
@@ -1789,7 +1455,7 @@ static int nvme_trans_modesel_data(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 		mp_size = parm_list[index + 1] + 2;
 		res = nvme_trans_modesel_get_mp(ns, hdr, &parm_list[index],
 								page_code);
-		if (res != SNTI_TRANSLATION_SUCCESS)
+		if (res)
 			break;
 		index += mp_size;
 	} while (index < parm_list_len);
@@ -1805,12 +1471,9 @@ static int nvme_trans_modesel_data(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 static int nvme_trans_fmt_set_blk_size_count(struct nvme_ns *ns,
 					     struct sg_io_hdr *hdr)
 {
-	int res = SNTI_TRANSLATION_SUCCESS;
+	int res = 0;
 	int nvme_sc;
 	struct nvme_dev *dev = ns->dev;
-	dma_addr_t dma_addr;
-	void *mem;
-	struct nvme_id_ns *id_ns;
 	u8 flbas;
 
 	/*
@@ -1821,22 +1484,12 @@ static int nvme_trans_fmt_set_blk_size_count(struct nvme_ns *ns,
 	 */
 
 	if (ns->mode_select_num_blocks == 0 || ns->mode_select_block_len == 0) {
-		mem = dma_alloc_coherent(&dev->pci_dev->dev,
-			sizeof(struct nvme_id_ns), &dma_addr, GFP_KERNEL);
-		if (mem == NULL) {
-			res = -ENOMEM;
-			goto out;
-		}
-		/* nvme ns identify */
-		nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr);
+		struct nvme_id_ns *id_ns;
+
+		nvme_sc = nvme_identify_ns(dev, ns->ns_id, &id_ns);
 		res = nvme_trans_status_code(hdr, nvme_sc);
 		if (res)
-			goto out_dma;
-		if (nvme_sc) {
-			res = nvme_sc;
-			goto out_dma;
-		}
-		id_ns = mem;
+			return res;
 
 		if (ns->mode_select_num_blocks == 0)
 			ns->mode_select_num_blocks = le64_to_cpu(id_ns->ncap);
@@ -1845,18 +1498,17 @@ static int nvme_trans_fmt_set_blk_size_count(struct nvme_ns *ns,
 			ns->mode_select_block_len =
 						(1 << (id_ns->lbaf[flbas].ds));
 		}
- out_dma:
-		dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns),
-				  mem, dma_addr);
+
+		kfree(id_ns);
 	}
- out:
-	return res;
+
+	return 0;
 }
 
 static int nvme_trans_fmt_get_parm_header(struct sg_io_hdr *hdr, u8 len,
 					u8 format_prot_info, u8 *nvme_pf_code)
 {
-	int res = SNTI_TRANSLATION_SUCCESS;
+	int res;
 	u8 *parm_list;
 	u8 pf_usage, pf_code;
 
@@ -1866,7 +1518,7 @@ static int nvme_trans_fmt_get_parm_header(struct sg_io_hdr *hdr, u8 len,
 		goto out;
 	}
 	res = nvme_trans_copy_from_user(hdr, parm_list, len);
-	if (res != SNTI_TRANSLATION_SUCCESS)
+	if (res)
 		goto out_mem;
 
 	if ((parm_list[FORMAT_UNIT_IMMED_OFFSET] &
@@ -1916,11 +1568,9 @@ static int nvme_trans_fmt_get_parm_header(struct sg_io_hdr *hdr, u8 len,
 static int nvme_trans_fmt_send_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 				   u8 prot_info)
 {
-	int res = SNTI_TRANSLATION_SUCCESS;
+	int res;
 	int nvme_sc;
 	struct nvme_dev *dev = ns->dev;
-	dma_addr_t dma_addr;
-	void *mem;
 	struct nvme_id_ns *id_ns;
 	u8 i;
 	u8 flbas, nlbaf;
@@ -1929,22 +1579,11 @@ static int nvme_trans_fmt_send_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 	struct nvme_command c;
 
 	/* Loop thru LBAF's in id_ns to match reqd lbaf, put in cdw10 */
-	mem = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns),
-							&dma_addr, GFP_KERNEL);
-	if (mem == NULL) {
-		res = -ENOMEM;
-		goto out;
-	}
-	/* nvme ns identify */
-	nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr);
+	nvme_sc = nvme_identify_ns(dev, ns->ns_id, &id_ns);
 	res = nvme_trans_status_code(hdr, nvme_sc);
 	if (res)
-		goto out_dma;
-	if (nvme_sc) {
-		res = nvme_sc;
-		goto out_dma;
-	}
-	id_ns = mem;
+		return res;
+
 	flbas = (id_ns->flbas) & 0x0F;
 	nlbaf = id_ns->nlbaf;
 
@@ -1972,69 +1611,13 @@ static int nvme_trans_fmt_send_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 	c.format.nsid = cpu_to_le32(ns->ns_id);
 	c.format.cdw10 = cpu_to_le32(cdw10);
 
-	nvme_sc = nvme_submit_admin_cmd(dev, &c, NULL);
+	nvme_sc = nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0);
 	res = nvme_trans_status_code(hdr, nvme_sc);
-	if (res)
-		goto out_dma;
-	if (nvme_sc)
-		res = nvme_sc;
 
- out_dma:
-	dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), mem,
-			  dma_addr);
- out:
+	kfree(id_ns);
 	return res;
 }
 
-/* Read/Write Helper Functions */
-
-static inline void nvme_trans_get_io_cdb6(u8 *cmd,
-					struct nvme_trans_io_cdb *cdb_info)
-{
-	cdb_info->fua = 0;
-	cdb_info->prot_info = 0;
-	cdb_info->lba = GET_U32_FROM_CDB(cmd, IO_6_CDB_LBA_OFFSET) &
-					IO_6_CDB_LBA_MASK;
-	cdb_info->xfer_len = GET_U8_FROM_CDB(cmd, IO_6_CDB_TX_LEN_OFFSET);
-
-	/* sbc3r27 sec 5.32 - TRANSFER LEN of 0 implies a 256 Block transfer */
-	if (cdb_info->xfer_len == 0)
-		cdb_info->xfer_len = IO_6_DEFAULT_TX_LEN;
-}
-
-static inline void nvme_trans_get_io_cdb10(u8 *cmd,
-					struct nvme_trans_io_cdb *cdb_info)
-{
-	cdb_info->fua = GET_U8_FROM_CDB(cmd, IO_10_CDB_FUA_OFFSET) &
-					IO_CDB_FUA_MASK;
-	cdb_info->prot_info = GET_U8_FROM_CDB(cmd, IO_10_CDB_WP_OFFSET) &
-					IO_CDB_WP_MASK >> IO_CDB_WP_SHIFT;
-	cdb_info->lba = GET_U32_FROM_CDB(cmd, IO_10_CDB_LBA_OFFSET);
-	cdb_info->xfer_len = GET_U16_FROM_CDB(cmd, IO_10_CDB_TX_LEN_OFFSET);
-}
-
-static inline void nvme_trans_get_io_cdb12(u8 *cmd,
-					struct nvme_trans_io_cdb *cdb_info)
-{
-	cdb_info->fua = GET_U8_FROM_CDB(cmd, IO_12_CDB_FUA_OFFSET) &
-					IO_CDB_FUA_MASK;
-	cdb_info->prot_info = GET_U8_FROM_CDB(cmd, IO_12_CDB_WP_OFFSET) &
-					IO_CDB_WP_MASK >> IO_CDB_WP_SHIFT;
-	cdb_info->lba = GET_U32_FROM_CDB(cmd, IO_12_CDB_LBA_OFFSET);
-	cdb_info->xfer_len = GET_U32_FROM_CDB(cmd, IO_12_CDB_TX_LEN_OFFSET);
-}
-
-static inline void nvme_trans_get_io_cdb16(u8 *cmd,
-					struct nvme_trans_io_cdb *cdb_info)
-{
-	cdb_info->fua = GET_U8_FROM_CDB(cmd, IO_16_CDB_FUA_OFFSET) &
-					IO_CDB_FUA_MASK;
-	cdb_info->prot_info = GET_U8_FROM_CDB(cmd, IO_16_CDB_WP_OFFSET) &
-					IO_CDB_WP_MASK >> IO_CDB_WP_SHIFT;
-	cdb_info->lba = GET_U64_FROM_CDB(cmd, IO_16_CDB_LBA_OFFSET);
-	cdb_info->xfer_len = GET_U32_FROM_CDB(cmd, IO_16_CDB_TX_LEN_OFFSET);
-}
-
 static inline u32 nvme_trans_io_get_num_cmds(struct sg_io_hdr *hdr,
 					struct nvme_trans_io_cdb *cdb_info,
 					u32 max_blocks)
@@ -2064,11 +1647,8 @@ static u16 nvme_trans_io_get_control(struct nvme_ns *ns,
 static int nvme_trans_do_nvme_io(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 				struct nvme_trans_io_cdb *cdb_info, u8 is_write)
 {
-	int res = SNTI_TRANSLATION_SUCCESS;
-	int nvme_sc;
-	struct nvme_dev *dev = ns->dev;
+	int nvme_sc = NVME_SC_SUCCESS;
 	u32 num_cmds;
-	struct nvme_iod *iod;
 	u64 unit_len;
 	u64 unit_num_blocks;	/* Number of blocks to xfer in each nvme cmd */
 	u32 retcode;
@@ -2119,45 +1699,20 @@ static int nvme_trans_do_nvme_io(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 		control = nvme_trans_io_get_control(ns, cdb_info);
 		c.rw.control = cpu_to_le16(control);
 
-		iod = nvme_map_user_pages(dev,
-			(is_write) ? DMA_TO_DEVICE : DMA_FROM_DEVICE,
-			(unsigned long)next_mapping_addr, unit_len);
-		if (IS_ERR(iod)) {
-			res = PTR_ERR(iod);
-			goto out;
-		}
-		retcode = nvme_setup_prps(dev, iod, unit_len, GFP_KERNEL);
-		if (retcode != unit_len) {
-			nvme_unmap_user_pages(dev,
-				(is_write) ? DMA_TO_DEVICE : DMA_FROM_DEVICE,
-				iod);
-			nvme_free_iod(dev, iod);
-			res = -ENOMEM;
-			goto out;
+		if (get_capacity(ns->disk) - unit_num_blocks <
+				cdb_info->lba + nvme_offset) {
+			nvme_sc = NVME_SC_LBA_RANGE;
+			break;
 		}
-		c.rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
-		c.rw.prp2 = cpu_to_le64(iod->first_dma);
+		nvme_sc = __nvme_submit_sync_cmd(ns->queue, &c, NULL,
+				next_mapping_addr, unit_len, NULL, 0);
+		if (nvme_sc)
+			break;
 
 		nvme_offset += unit_num_blocks;
-
-		nvme_sc = nvme_submit_io_cmd(dev, ns, &c, NULL);
-		if (nvme_sc != NVME_SC_SUCCESS) {
-			nvme_unmap_user_pages(dev,
-				(is_write) ? DMA_TO_DEVICE : DMA_FROM_DEVICE,
-				iod);
-			nvme_free_iod(dev, iod);
-			res = nvme_trans_status_code(hdr, nvme_sc);
-			goto out;
-		}
-		nvme_unmap_user_pages(dev,
-				(is_write) ? DMA_TO_DEVICE : DMA_FROM_DEVICE,
-				iod);
-		nvme_free_iod(dev, iod);
 	}
-	res = nvme_trans_status_code(hdr, NVME_SC_SUCCESS);
 
- out:
-	return res;
+	return nvme_trans_status_code(hdr, nvme_sc);
 }
 
 
@@ -2166,8 +1721,8 @@ static int nvme_trans_do_nvme_io(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 static int nvme_trans_io(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 is_write,
 							u8 *cmd)
 {
-	int res = SNTI_TRANSLATION_SUCCESS;
-	struct nvme_trans_io_cdb cdb_info;
+	int res = 0;
+	struct nvme_trans_io_cdb cdb_info = { 0, };
 	u8 opcode = cmd[0];
 	u64 xfer_bytes;
 	u64 sum_iov_len = 0;
@@ -2175,27 +1730,52 @@ static int nvme_trans_io(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 is_write,
 	int i;
 	size_t not_copied;
 
-	/* Extract Fields from CDB */
+	/*
+	 * The FUA and WPROTECT fields are not supported in 6-byte CDBs,
+	 * but always in the same place for all others.
+	 */
 	switch (opcode) {
 	case WRITE_6:
 	case READ_6:
-		nvme_trans_get_io_cdb6(cmd, &cdb_info);
+		break;
+	default:
+		cdb_info.fua = cmd[1] & 0x8;
+		cdb_info.prot_info = (cmd[1] & 0xe0) >> 5;
+		if (cdb_info.prot_info && !ns->pi_type) {
+			return nvme_trans_completion(hdr,
+					SAM_STAT_CHECK_CONDITION,
+					ILLEGAL_REQUEST,
+					SCSI_ASC_INVALID_CDB,
+					SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+		}
+	}
+
+	switch (opcode) {
+	case WRITE_6:
+	case READ_6:
+		cdb_info.lba = get_unaligned_be24(&cmd[1]);
+		cdb_info.xfer_len = cmd[4];
+		if (cdb_info.xfer_len == 0)
+			cdb_info.xfer_len = 256;
 		break;
 	case WRITE_10:
 	case READ_10:
-		nvme_trans_get_io_cdb10(cmd, &cdb_info);
+		cdb_info.lba = get_unaligned_be32(&cmd[2]);
+		cdb_info.xfer_len = get_unaligned_be16(&cmd[7]);
 		break;
 	case WRITE_12:
 	case READ_12:
-		nvme_trans_get_io_cdb12(cmd, &cdb_info);
+		cdb_info.lba = get_unaligned_be32(&cmd[2]);
+		cdb_info.xfer_len = get_unaligned_be32(&cmd[6]);
 		break;
 	case WRITE_16:
 	case READ_16:
-		nvme_trans_get_io_cdb16(cmd, &cdb_info);
+		cdb_info.lba = get_unaligned_be64(&cmd[2]);
+		cdb_info.xfer_len = get_unaligned_be32(&cmd[10]);
 		break;
 	default:
 		/* Will never really reach here */
-		res = SNTI_INTERNAL_ERROR;
+		res = -EIO;
 		goto out;
 	}
 
@@ -2237,7 +1817,7 @@ static int nvme_trans_io(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 is_write,
 
 	/* Send NVMe IO Command(s) */
 	res = nvme_trans_do_nvme_io(ns, hdr, &cdb_info, is_write);
-	if (res != SNTI_TRANSLATION_SUCCESS)
+	if (res)
 		goto out;
 
  out:
@@ -2247,15 +1827,15 @@ static int nvme_trans_io(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 is_write,
 static int nvme_trans_inquiry(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 							u8 *cmd)
 {
-	int res = SNTI_TRANSLATION_SUCCESS;
+	int res = 0;
 	u8 evpd;
 	u8 page_code;
 	int alloc_len;
 	u8 *inq_response;
 
-	evpd = GET_INQ_EVPD_BIT(cmd);
-	page_code = GET_INQ_PAGE_CODE(cmd);
-	alloc_len = GET_INQ_ALLOC_LENGTH(cmd);
+	evpd = cmd[1] & 0x01;
+	page_code = cmd[2];
+	alloc_len = get_unaligned_be16(&cmd[3]);
 
 	inq_response = kmalloc(max(alloc_len, STANDARD_INQUIRY_LENGTH),
 				GFP_KERNEL);
@@ -2316,29 +1896,27 @@ static int nvme_trans_inquiry(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 static int nvme_trans_log_sense(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 							u8 *cmd)
 {
-	int res = SNTI_TRANSLATION_SUCCESS;
+	int res;
 	u16 alloc_len;
-	u8 sp;
 	u8 pc;
 	u8 page_code;
 
-	sp = GET_U8_FROM_CDB(cmd, LOG_SENSE_CDB_SP_OFFSET);
-	if (sp != LOG_SENSE_CDB_SP_NOT_ENABLED) {
+	if (cmd[1] != LOG_SENSE_CDB_SP_NOT_ENABLED) {
 		res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION,
 					ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB,
 					SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
 		goto out;
 	}
-	pc = GET_U8_FROM_CDB(cmd, LOG_SENSE_CDB_PC_OFFSET);
-	page_code = pc & LOG_SENSE_CDB_PAGE_CODE_MASK;
-	pc = (pc & LOG_SENSE_CDB_PC_MASK) >> LOG_SENSE_CDB_PC_SHIFT;
+
+	page_code = cmd[2] & LOG_SENSE_CDB_PAGE_CODE_MASK;
+	pc = (cmd[2] & LOG_SENSE_CDB_PC_MASK) >> LOG_SENSE_CDB_PC_SHIFT;
 	if (pc != LOG_SENSE_CDB_PC_CUMULATIVE_VALUES) {
 		res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION,
 					ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB,
 					SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
 		goto out;
 	}
-	alloc_len = GET_U16_FROM_CDB(cmd, LOG_SENSE_CDB_ALLOC_LENGTH_OFFSET);
+	alloc_len = get_unaligned_be16(&cmd[7]);
 	switch (page_code) {
 	case LOG_PAGE_SUPPORTED_LOG_PAGES_PAGE:
 		res = nvme_trans_log_supp_pages(ns, hdr, alloc_len);
@@ -2363,24 +1941,18 @@ static int nvme_trans_log_sense(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 static int nvme_trans_mode_select(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 							u8 *cmd)
 {
-	int res = SNTI_TRANSLATION_SUCCESS;
 	u8 cdb10 = 0;
 	u16 parm_list_len;
 	u8 page_format;
 	u8 save_pages;
 
-	page_format = GET_U8_FROM_CDB(cmd, MODE_SELECT_CDB_PAGE_FORMAT_OFFSET);
-	page_format &= MODE_SELECT_CDB_PAGE_FORMAT_MASK;
-
-	save_pages = GET_U8_FROM_CDB(cmd, MODE_SELECT_CDB_SAVE_PAGES_OFFSET);
-	save_pages &= MODE_SELECT_CDB_SAVE_PAGES_MASK;
+	page_format = cmd[1] & MODE_SELECT_CDB_PAGE_FORMAT_MASK;
+	save_pages = cmd[1] & MODE_SELECT_CDB_SAVE_PAGES_MASK;
 
-	if (GET_OPCODE(cmd) == MODE_SELECT) {
-		parm_list_len = GET_U8_FROM_CDB(cmd,
-				MODE_SELECT_6_CDB_PARAM_LIST_LENGTH_OFFSET);
+	if (cmd[0] == MODE_SELECT) {
+		parm_list_len = cmd[4];
 	} else {
-		parm_list_len = GET_U16_FROM_CDB(cmd,
-				MODE_SELECT_10_CDB_PARAM_LIST_LENGTH_OFFSET);
+		parm_list_len = cmd[7];
 		cdb10 = 1;
 	}
 
@@ -2389,42 +1961,36 @@ static int nvme_trans_mode_select(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 		 * According to SPC-4 r24, a paramter list length field of 0
 		 * shall not be considered an error
 		 */
-		res = nvme_trans_modesel_data(ns, hdr, cmd, parm_list_len,
+		return nvme_trans_modesel_data(ns, hdr, cmd, parm_list_len,
 						page_format, save_pages, cdb10);
 	}
 
-	return res;
+	return 0;
 }
 
 static int nvme_trans_mode_sense(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 							u8 *cmd)
 {
-	int res = SNTI_TRANSLATION_SUCCESS;
+	int res = 0;
 	u16 alloc_len;
 	u8 cdb10 = 0;
-	u8 page_code;
-	u8 pc;
 
-	if (GET_OPCODE(cmd) == MODE_SENSE) {
-		alloc_len = GET_U8_FROM_CDB(cmd, MODE_SENSE6_ALLOC_LEN_OFFSET);
+	if (cmd[0] == MODE_SENSE) {
+		alloc_len = cmd[4];
 	} else {
-		alloc_len = GET_U16_FROM_CDB(cmd,
-						MODE_SENSE10_ALLOC_LEN_OFFSET);
+		alloc_len = get_unaligned_be16(&cmd[7]);
 		cdb10 = 1;
 	}
 
-	pc = GET_U8_FROM_CDB(cmd, MODE_SENSE_PAGE_CONTROL_OFFSET) &
-						MODE_SENSE_PAGE_CONTROL_MASK;
-	if (pc != MODE_SENSE_PC_CURRENT_VALUES) {
+	if ((cmd[2] & MODE_SENSE_PAGE_CONTROL_MASK) !=
+			MODE_SENSE_PC_CURRENT_VALUES) {
 		res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION,
 					ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB,
 					SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
 		goto out;
 	}
 
-	page_code = GET_U8_FROM_CDB(cmd, MODE_SENSE_PAGE_CODE_OFFSET) &
-					MODE_SENSE_PAGE_CODE_MASK;
-	switch (page_code) {
+	switch (cmd[2] & MODE_SENSE_PAGE_CODE_MASK) {
 	case MODE_PAGE_CACHING:
 		res = nvme_trans_mode_page_create(ns, hdr, cmd, alloc_len,
 						cdb10,
@@ -2467,47 +2033,34 @@ static int nvme_trans_mode_sense(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 }
 
 static int nvme_trans_read_capacity(struct nvme_ns *ns, struct sg_io_hdr *hdr,
-							u8 *cmd)
+							u8 *cmd, u8 cdb16)
 {
-	int res = SNTI_TRANSLATION_SUCCESS;
+	int res;
 	int nvme_sc;
-	u32 alloc_len = READ_CAP_10_RESP_SIZE;
-	u32 resp_size = READ_CAP_10_RESP_SIZE;
+	u32 alloc_len;
+	u32 resp_size;
 	u32 xfer_len;
-	u8 cdb16;
 	struct nvme_dev *dev = ns->dev;
-	dma_addr_t dma_addr;
-	void *mem;
 	struct nvme_id_ns *id_ns;
 	u8 *response;
 
-	cdb16 = IS_READ_CAP_16(cmd);
 	if (cdb16) {
-		alloc_len = GET_READ_CAP_16_ALLOC_LENGTH(cmd);
+		alloc_len = get_unaligned_be32(&cmd[10]);
 		resp_size = READ_CAP_16_RESP_SIZE;
+	} else {
+		alloc_len = READ_CAP_10_RESP_SIZE;
+		resp_size = READ_CAP_10_RESP_SIZE;
 	}
 
-	mem = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns),
-							&dma_addr, GFP_KERNEL);
-	if (mem == NULL) {
-		res = -ENOMEM;
-		goto out;
-	}
-	/* nvme ns identify */
-	nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr);
+	nvme_sc = nvme_identify_ns(dev, ns->ns_id, &id_ns);
 	res = nvme_trans_status_code(hdr, nvme_sc);
 	if (res)
-		goto out_dma;
-	if (nvme_sc) {
-		res = nvme_sc;
-		goto out_dma;
-	}
-	id_ns = mem;
+		return res;	
 
 	response = kzalloc(resp_size, GFP_KERNEL);
 	if (response == NULL) {
 		res = -ENOMEM;
-		goto out_dma;
+		goto out_free_id;
 	}
 	nvme_trans_fill_read_cap(response, id_ns, cdb16);
 
@@ -2515,72 +2068,53 @@ static int nvme_trans_read_capacity(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 	res = nvme_trans_copy_to_user(hdr, response, xfer_len);
 
 	kfree(response);
- out_dma:
-	dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), mem,
-			  dma_addr);
- out:
+ out_free_id:
+	kfree(id_ns);
 	return res;
 }
 
 static int nvme_trans_report_luns(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 							u8 *cmd)
 {
-	int res = SNTI_TRANSLATION_SUCCESS;
+	int res;
 	int nvme_sc;
 	u32 alloc_len, xfer_len, resp_size;
-	u8 select_report;
 	u8 *response;
 	struct nvme_dev *dev = ns->dev;
-	dma_addr_t dma_addr;
-	void *mem;
 	struct nvme_id_ctrl *id_ctrl;
 	u32 ll_length, lun_id;
 	u8 lun_id_offset = REPORT_LUNS_FIRST_LUN_OFFSET;
 	__be32 tmp_len;
 
-	alloc_len = GET_REPORT_LUNS_ALLOC_LENGTH(cmd);
-	select_report = GET_U8_FROM_CDB(cmd, REPORT_LUNS_SR_OFFSET);
-
-	if ((select_report != ALL_LUNS_RETURNED) &&
-	    (select_report != ALL_WELL_KNOWN_LUNS_RETURNED) &&
-	    (select_report != RESTRICTED_LUNS_RETURNED)) {
-		res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION,
+	switch (cmd[2]) {
+	default:
+		return nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION,
 					ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB,
 					SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
-		goto out;
-	} else {
-		/* NVMe Controller Identify */
-		mem = dma_alloc_coherent(&dev->pci_dev->dev,
-					sizeof(struct nvme_id_ctrl),
-					&dma_addr, GFP_KERNEL);
-		if (mem == NULL) {
-			res = -ENOMEM;
-			goto out;
-		}
-		nvme_sc = nvme_identify(dev, 0, 1, dma_addr);
+	case ALL_LUNS_RETURNED:
+	case ALL_WELL_KNOWN_LUNS_RETURNED:
+	case RESTRICTED_LUNS_RETURNED:
+		nvme_sc = nvme_identify_ctrl(dev, &id_ctrl);
 		res = nvme_trans_status_code(hdr, nvme_sc);
 		if (res)
-			goto out_dma;
-		if (nvme_sc) {
-			res = nvme_sc;
-			goto out_dma;
-		}
-		id_ctrl = mem;
+			return res;
+
 		ll_length = le32_to_cpu(id_ctrl->nn) * LUN_ENTRY_SIZE;
 		resp_size = ll_length + LUN_DATA_HEADER_SIZE;
 
+		alloc_len = get_unaligned_be32(&cmd[6]);
 		if (alloc_len < resp_size) {
 			res = nvme_trans_completion(hdr,
 					SAM_STAT_CHECK_CONDITION,
 					ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB,
 					SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
-			goto out_dma;
+			goto out_free_id;
 		}
 
 		response = kzalloc(resp_size, GFP_KERNEL);
 		if (response == NULL) {
 			res = -ENOMEM;
-			goto out_dma;
+			goto out_free_id;
 		}
 
 		/* The first LUN ID will always be 0 per the SAM spec */
@@ -2601,24 +2135,21 @@ static int nvme_trans_report_luns(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 	res = nvme_trans_copy_to_user(hdr, response, xfer_len);
 
 	kfree(response);
- out_dma:
-	dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ctrl), mem,
-			  dma_addr);
- out:
+ out_free_id:
+	kfree(id_ctrl);
 	return res;
 }
 
 static int nvme_trans_request_sense(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 							u8 *cmd)
 {
-	int res = SNTI_TRANSLATION_SUCCESS;
+	int res;
 	u8 alloc_len, xfer_len, resp_size;
 	u8 desc_format;
 	u8 *response;
 
-	alloc_len = GET_REQUEST_SENSE_ALLOC_LENGTH(cmd);
-	desc_format = GET_U8_FROM_CDB(cmd, REQUEST_SENSE_DESC_OFFSET);
-	desc_format &= REQUEST_SENSE_DESC_MASK;
+	desc_format = cmd[1] & 0x01;
+	alloc_len = cmd[4];
 
 	resp_size = ((desc_format) ? (DESC_FMT_SENSE_DATA_SIZE) :
 					(FIXED_FMT_SENSE_DATA_SIZE));
@@ -2628,7 +2159,7 @@ static int nvme_trans_request_sense(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 		goto out;
 	}
 
-	if (desc_format == DESCRIPTOR_FORMAT_SENSE_DATA_TYPE) {
+	if (desc_format) {
 		/* Descriptor Format Sense Data */
 		response[0] = DESC_FORMAT_SENSE_DATA;
 		response[1] = NO_SENSE;
@@ -2667,95 +2198,58 @@ static int nvme_trans_security_protocol(struct nvme_ns *ns,
 				SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
 }
 
-static int nvme_trans_start_stop(struct nvme_ns *ns, struct sg_io_hdr *hdr,
-							u8 *cmd)
+static int nvme_trans_synchronize_cache(struct nvme_ns *ns,
+					struct sg_io_hdr *hdr)
 {
-	int res = SNTI_TRANSLATION_SUCCESS;
 	int nvme_sc;
 	struct nvme_command c;
-	u8 immed, pcmod, pc, no_flush, start;
 
-	immed = GET_U8_FROM_CDB(cmd, START_STOP_UNIT_CDB_IMMED_OFFSET);
-	pcmod = GET_U8_FROM_CDB(cmd, START_STOP_UNIT_CDB_POWER_COND_MOD_OFFSET);
-	pc = GET_U8_FROM_CDB(cmd, START_STOP_UNIT_CDB_POWER_COND_OFFSET);
-	no_flush = GET_U8_FROM_CDB(cmd, START_STOP_UNIT_CDB_NO_FLUSH_OFFSET);
-	start = GET_U8_FROM_CDB(cmd, START_STOP_UNIT_CDB_START_OFFSET);
+	memset(&c, 0, sizeof(c));
+	c.common.opcode = nvme_cmd_flush;
+	c.common.nsid = cpu_to_le32(ns->ns_id);
 
-	immed &= START_STOP_UNIT_CDB_IMMED_MASK;
-	pcmod &= START_STOP_UNIT_CDB_POWER_COND_MOD_MASK;
-	pc = (pc & START_STOP_UNIT_CDB_POWER_COND_MASK) >> NIBBLE_SHIFT;
-	no_flush &= START_STOP_UNIT_CDB_NO_FLUSH_MASK;
-	start &= START_STOP_UNIT_CDB_START_MASK;
+	nvme_sc = nvme_submit_sync_cmd(ns->queue, &c, NULL, 0);
+	return nvme_trans_status_code(hdr, nvme_sc);
+}
+
+static int nvme_trans_start_stop(struct nvme_ns *ns, struct sg_io_hdr *hdr,
+							u8 *cmd)
+{
+	u8 immed, pcmod, pc, no_flush, start;
+
+	immed = cmd[1] & 0x01;
+	pcmod = cmd[3] & 0x0f;
+	pc = (cmd[4] & 0xf0) >> 4;
+	no_flush = cmd[4] & 0x04;
+	start = cmd[4] & 0x01;
 
 	if (immed != 0) {
-		res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION,
+		return nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION,
 					ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB,
 					SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
 	} else {
 		if (no_flush == 0) {
 			/* Issue NVME FLUSH command prior to START STOP UNIT */
-			memset(&c, 0, sizeof(c));
-			c.common.opcode = nvme_cmd_flush;
-			c.common.nsid = cpu_to_le32(ns->ns_id);
-
-			nvme_sc = nvme_submit_io_cmd(ns->dev, ns, &c, NULL);
-			res = nvme_trans_status_code(hdr, nvme_sc);
+			int res = nvme_trans_synchronize_cache(ns, hdr);
 			if (res)
-				goto out;
-			if (nvme_sc) {
-				res = nvme_sc;
-				goto out;
-			}
+				return res;
 		}
 		/* Setup the expected power state transition */
-		res = nvme_trans_power_state(ns, hdr, pc, pcmod, start);
+		return nvme_trans_power_state(ns, hdr, pc, pcmod, start);
 	}
-
- out:
-	return res;
-}
-
-static int nvme_trans_synchronize_cache(struct nvme_ns *ns,
-					struct sg_io_hdr *hdr, u8 *cmd)
-{
-	int res = SNTI_TRANSLATION_SUCCESS;
-	int nvme_sc;
-	struct nvme_command c;
-
-	memset(&c, 0, sizeof(c));
-	c.common.opcode = nvme_cmd_flush;
-	c.common.nsid = cpu_to_le32(ns->ns_id);
-
-	nvme_sc = nvme_submit_io_cmd(ns->dev, ns, &c, NULL);
-
-	res = nvme_trans_status_code(hdr, nvme_sc);
-	if (res)
-		goto out;
-	if (nvme_sc)
-		res = nvme_sc;
-
- out:
-	return res;
 }
 
 static int nvme_trans_format_unit(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 							u8 *cmd)
 {
-	int res = SNTI_TRANSLATION_SUCCESS;
+	int res;
 	u8 parm_hdr_len = 0;
 	u8 nvme_pf_code = 0;
 	u8 format_prot_info, long_list, format_data;
 
-	format_prot_info = GET_U8_FROM_CDB(cmd,
-				FORMAT_UNIT_CDB_FORMAT_PROT_INFO_OFFSET);
-	long_list = GET_U8_FROM_CDB(cmd, FORMAT_UNIT_CDB_LONG_LIST_OFFSET);
-	format_data = GET_U8_FROM_CDB(cmd, FORMAT_UNIT_CDB_FORMAT_DATA_OFFSET);
-
-	format_prot_info = (format_prot_info &
-				FORMAT_UNIT_CDB_FORMAT_PROT_INFO_MASK) >>
-				FORMAT_UNIT_CDB_FORMAT_PROT_INFO_SHIFT;
-	long_list &= FORMAT_UNIT_CDB_LONG_LIST_MASK;
-	format_data &= FORMAT_UNIT_CDB_FORMAT_DATA_MASK;
+	format_prot_info = (cmd[1] & 0xc0) >> 6;
+	long_list = cmd[1] & 0x20;
+	format_data = cmd[1] & 0x10;
 
 	if (format_data != 0) {
 		if (format_prot_info != 0) {
@@ -2779,16 +2273,16 @@ static int nvme_trans_format_unit(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 	if (parm_hdr_len > 0) {
 		res = nvme_trans_fmt_get_parm_header(hdr, parm_hdr_len,
 					format_prot_info, &nvme_pf_code);
-		if (res != SNTI_TRANSLATION_SUCCESS)
+		if (res)
 			goto out;
 	}
 
 	/* Attempt to activate any previously downloaded firmware image */
-	res = nvme_trans_send_fw_cmd(ns, hdr, nvme_admin_activate_fw, 0, 0, 0);
+	res = nvme_trans_send_activate_fw_cmd(ns, hdr, 0);
 
 	/* Determine Block size and count and send format command */
 	res = nvme_trans_fmt_set_blk_size_count(ns, hdr);
-	if (res != SNTI_TRANSLATION_SUCCESS)
+	if (res)
 		goto out;
 
 	res = nvme_trans_fmt_send_cmd(ns, hdr, nvme_pf_code);
@@ -2801,28 +2295,24 @@ static int nvme_trans_test_unit_ready(struct nvme_ns *ns,
 					struct sg_io_hdr *hdr,
 					u8 *cmd)
 {
-	int res = SNTI_TRANSLATION_SUCCESS;
 	struct nvme_dev *dev = ns->dev;
 
 	if (!(readl(&dev->bar->csts) & NVME_CSTS_RDY))
-		res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION,
+		return nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION,
 					    NOT_READY, SCSI_ASC_LUN_NOT_READY,
 					    SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
 	else
-		res = nvme_trans_completion(hdr, SAM_STAT_GOOD, NO_SENSE, 0, 0);
-
-	return res;
+		return nvme_trans_completion(hdr, SAM_STAT_GOOD, NO_SENSE, 0, 0);
 }
 
 static int nvme_trans_write_buffer(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 							u8 *cmd)
 {
-	int res = SNTI_TRANSLATION_SUCCESS;
+	int res = 0;
 	u32 buffer_offset, parm_list_length;
 	u8 buffer_id, mode;
 
-	parm_list_length =
-		GET_U24_FROM_CDB(cmd, WRITE_BUFFER_CDB_PARM_LIST_LENGTH_OFFSET);
+	parm_list_length = get_unaligned_be24(&cmd[6]);
 	if (parm_list_length % BYTES_TO_DWORDS != 0) {
 		/* NVMe expects Firmware file to be a whole number of DWORDS */
 		res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION,
@@ -2830,38 +2320,32 @@ static int nvme_trans_write_buffer(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 					SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
 		goto out;
 	}
-	buffer_id = GET_U8_FROM_CDB(cmd, WRITE_BUFFER_CDB_BUFFER_ID_OFFSET);
+	buffer_id = cmd[2];
 	if (buffer_id > NVME_MAX_FIRMWARE_SLOT) {
 		res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION,
 					ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB,
 					SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
 		goto out;
 	}
-	mode = GET_U8_FROM_CDB(cmd, WRITE_BUFFER_CDB_MODE_OFFSET) &
-						WRITE_BUFFER_CDB_MODE_MASK;
-	buffer_offset =
-		GET_U24_FROM_CDB(cmd, WRITE_BUFFER_CDB_BUFFER_OFFSET_OFFSET);
+	mode = cmd[1] & 0x1f;
+	buffer_offset = get_unaligned_be24(&cmd[3]);
 
 	switch (mode) {
 	case DOWNLOAD_SAVE_ACTIVATE:
-		res = nvme_trans_send_fw_cmd(ns, hdr, nvme_admin_download_fw,
+		res = nvme_trans_send_download_fw_cmd(ns, hdr, nvme_admin_download_fw,
 						parm_list_length, buffer_offset,
 						buffer_id);
-		if (res != SNTI_TRANSLATION_SUCCESS)
+		if (res)
 			goto out;
-		res = nvme_trans_send_fw_cmd(ns, hdr, nvme_admin_activate_fw,
-						parm_list_length, buffer_offset,
-						buffer_id);
+		res = nvme_trans_send_activate_fw_cmd(ns, hdr, buffer_id);
 		break;
 	case DOWNLOAD_SAVE_DEFER_ACTIVATE:
-		res = nvme_trans_send_fw_cmd(ns, hdr, nvme_admin_download_fw,
+		res = nvme_trans_send_download_fw_cmd(ns, hdr, nvme_admin_download_fw,
 						parm_list_length, buffer_offset,
 						buffer_id);
 		break;
 	case ACTIVATE_DEFERRED_MICROCODE:
-		res = nvme_trans_send_fw_cmd(ns, hdr, nvme_admin_activate_fw,
-						parm_list_length, buffer_offset,
-						buffer_id);
+		res = nvme_trans_send_activate_fw_cmd(ns, hdr, buffer_id);
 		break;
 	default:
 		res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION,
@@ -2890,15 +2374,13 @@ struct scsi_unmap_parm_list {
 static int nvme_trans_unmap(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 							u8 *cmd)
 {
-	struct nvme_dev *dev = ns->dev;
 	struct scsi_unmap_parm_list *plist;
 	struct nvme_dsm_range *range;
 	struct nvme_command c;
 	int i, nvme_sc, res = -ENOMEM;
 	u16 ndesc, list_len;
-	dma_addr_t dma_addr;
 
-	list_len = GET_U16_FROM_CDB(cmd, UNMAP_CDB_PARAM_LIST_LENGTH_OFFSET);
+	list_len = get_unaligned_be16(&cmd[7]);
 	if (!list_len)
 		return -EINVAL;
 
@@ -2907,7 +2389,7 @@ static int nvme_trans_unmap(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 		return -ENOMEM;
 
 	res = nvme_trans_copy_from_user(hdr, plist, list_len);
-	if (res != SNTI_TRANSLATION_SUCCESS)
+	if (res)
 		goto out;
 
 	ndesc = be16_to_cpu(plist->unmap_blk_desc_data_len) >> 4;
@@ -2916,8 +2398,7 @@ static int nvme_trans_unmap(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 		goto out;
 	}
 
-	range = dma_alloc_coherent(&dev->pci_dev->dev, ndesc * sizeof(*range),
-							&dma_addr, GFP_KERNEL);
+	range = kcalloc(ndesc, sizeof(*range), GFP_KERNEL);
 	if (!range)
 		goto out;
 
@@ -2930,15 +2411,14 @@ static int nvme_trans_unmap(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 	memset(&c, 0, sizeof(c));
 	c.dsm.opcode = nvme_cmd_dsm;
 	c.dsm.nsid = cpu_to_le32(ns->ns_id);
-	c.dsm.prp1 = cpu_to_le64(dma_addr);
 	c.dsm.nr = cpu_to_le32(ndesc - 1);
 	c.dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
 
-	nvme_sc = nvme_submit_io_cmd(dev, ns, &c, NULL);
+	nvme_sc = nvme_submit_sync_cmd(ns->queue, &c, range,
+			ndesc * sizeof(*range));
 	res = nvme_trans_status_code(hdr, nvme_sc);
 
-	dma_free_coherent(&dev->pci_dev->dev, ndesc * sizeof(*range),
-							range, dma_addr);
+	kfree(range);
  out:
 	kfree(plist);
 	return res;
@@ -2993,13 +2473,16 @@ static int nvme_scsi_translate(struct nvme_ns *ns, struct sg_io_hdr *hdr)
 		retcode = nvme_trans_mode_sense(ns, hdr, cmd);
 		break;
 	case READ_CAPACITY:
-		retcode = nvme_trans_read_capacity(ns, hdr, cmd);
+		retcode = nvme_trans_read_capacity(ns, hdr, cmd, 0);
 		break;
 	case SERVICE_ACTION_IN_16:
-		if (IS_READ_CAP_16(cmd))
-			retcode = nvme_trans_read_capacity(ns, hdr, cmd);
-		else
+		switch (cmd[1]) {
+		case SAI_READ_CAPACITY_16:
+			retcode = nvme_trans_read_capacity(ns, hdr, cmd, 1);
+			break;
+		default:
 			goto out;
+		}
 		break;
 	case REPORT_LUNS:
 		retcode = nvme_trans_report_luns(ns, hdr, cmd);
@@ -3015,7 +2498,7 @@ static int nvme_scsi_translate(struct nvme_ns *ns, struct sg_io_hdr *hdr)
 		retcode = nvme_trans_start_stop(ns, hdr, cmd);
 		break;
 	case SYNCHRONIZE_CACHE:
-		retcode = nvme_trans_synchronize_cache(ns, hdr, cmd);
+		retcode = nvme_trans_synchronize_cache(ns, hdr);
 		break;
 	case FORMAT_UNIT:
 		retcode = nvme_trans_format_unit(ns, hdr, cmd);
@@ -3053,15 +2536,16 @@ int nvme_sg_io(struct nvme_ns *ns, struct sg_io_hdr __user *u_hdr)
 	if (hdr.cmd_len > BLK_MAX_CDB)
 		return -EINVAL;
 
+	/*
+	 * A positive return code means a NVMe status, which has been
+	 * translated to sense data.
+	 */
 	retcode = nvme_scsi_translate(ns, &hdr);
 	if (retcode < 0)
 		return retcode;
-	if (retcode > 0)
-		retcode = SNTI_TRANSLATION_SUCCESS;
 	if (copy_to_user(u_hdr, &hdr, sizeof(sg_io_hdr_t)) > 0)
 		return -EFAULT;
-
-	return retcode;
+	return 0;
 }
 
 int nvme_sg_get_version_num(int __user *ip)
diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c
index d48715b287e667..dbb4da1cdca8b2 100644
--- a/drivers/block/paride/pd.c
+++ b/drivers/block/paride/pd.c
@@ -442,7 +442,7 @@ static char *pd_buf;		/* buffer for request in progress */
 
 static enum action do_pd_io_start(void)
 {
-	if (pd_req->cmd_type == REQ_TYPE_SPECIAL) {
+	if (pd_req->cmd_type == REQ_TYPE_DRV_PRIV) {
 		phase = pd_special;
 		return pd_special();
 	}
@@ -725,7 +725,7 @@ static int pd_special_command(struct pd_unit *disk,
 	if (IS_ERR(rq))
 		return PTR_ERR(rq);
 
-	rq->cmd_type = REQ_TYPE_SPECIAL;
+	rq->cmd_type = REQ_TYPE_DRV_PRIV;
 	rq->special = func;
 
 	err = blk_execute_rq(disk->gd->queue, disk->gd, rq, 0);
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 09e628dafd9d82..4c20c228184c32 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -61,6 +61,7 @@
 #include <linux/freezer.h>
 #include <linux/mutex.h>
 #include <linux/slab.h>
+#include <linux/backing-dev.h>
 #include <scsi/scsi_cmnd.h>
 #include <scsi/scsi_ioctl.h>
 #include <scsi/scsi.h>
diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c
index 5d552857de412e..59c91d49b14b64 100644
--- a/drivers/block/sx8.c
+++ b/drivers/block/sx8.c
@@ -620,7 +620,7 @@ static int carm_array_info (struct carm_host *host, unsigned int array_idx)
 	spin_unlock_irq(&host->lock);
 
 	DPRINTK("blk_execute_rq_nowait, tag == %u\n", idx);
-	crq->rq->cmd_type = REQ_TYPE_SPECIAL;
+	crq->rq->cmd_type = REQ_TYPE_DRV_PRIV;
 	crq->rq->special = crq;
 	blk_execute_rq_nowait(host->oob_q, NULL, crq->rq, true, NULL);
 
@@ -661,7 +661,7 @@ static int carm_send_special (struct carm_host *host, carm_sspc_t func)
 	crq->msg_bucket = (u32) rc;
 
 	DPRINTK("blk_execute_rq_nowait, tag == %u\n", idx);
-	crq->rq->cmd_type = REQ_TYPE_SPECIAL;
+	crq->rq->cmd_type = REQ_TYPE_DRV_PRIV;
 	crq->rq->special = crq;
 	blk_execute_rq_nowait(host->oob_q, NULL, crq->rq, true, NULL);
 
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 5ea2f0bbbc7c3d..d4d05f064d3907 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -124,7 +124,7 @@ static inline void virtblk_request_done(struct request *req)
 		req->resid_len = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.residual);
 		req->sense_len = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.sense_len);
 		req->errors = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.errors);
-	} else if (req->cmd_type == REQ_TYPE_SPECIAL) {
+	} else if (req->cmd_type == REQ_TYPE_DRV_PRIV) {
 		req->errors = (error != 0);
 	}
 
@@ -188,7 +188,7 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx,
 			vbr->out_hdr.sector = 0;
 			vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(vbr->req));
 			break;
-		case REQ_TYPE_SPECIAL:
+		case REQ_TYPE_DRV_PRIV:
 			vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_GET_ID);
 			vbr->out_hdr.sector = 0;
 			vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(vbr->req));
@@ -251,7 +251,7 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str)
 		return PTR_ERR(req);
 	}
 
-	req->cmd_type = REQ_TYPE_SPECIAL;
+	req->cmd_type = REQ_TYPE_DRV_PRIV;
 	err = blk_execute_rq(vblk->disk->queue, vblk->disk, req, false);
 	blk_put_request(req);
 
diff --git a/drivers/char/raw.c b/drivers/char/raw.c
index 5fc291c6157e31..60316fbaf2957d 100644
--- a/drivers/char/raw.c
+++ b/drivers/char/raw.c
@@ -12,6 +12,7 @@
 #include <linux/fs.h>
 #include <linux/major.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/module.h>
 #include <linux/raw.h>
 #include <linux/capability.h>
diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index fac3d9da2e07db..1362ad80a76c07 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -93,7 +93,7 @@ int ide_queue_pc_tail(ide_drive_t *drive, struct gendisk *disk,
 	int error;
 
 	rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
-	rq->cmd_type = REQ_TYPE_SPECIAL;
+	rq->cmd_type = REQ_TYPE_DRV_PRIV;
 	rq->special = (char *)pc;
 
 	if (buf && bufflen) {
@@ -191,7 +191,7 @@ void ide_prep_sense(ide_drive_t *drive, struct request *rq)
 
 	BUG_ON(sense_len > sizeof(*sense));
 
-	if (rq->cmd_type == REQ_TYPE_SENSE || drive->sense_rq_armed)
+	if (rq->cmd_type == REQ_TYPE_ATA_SENSE || drive->sense_rq_armed)
 		return;
 
 	memset(sense, 0, sizeof(*sense));
@@ -210,7 +210,7 @@ void ide_prep_sense(ide_drive_t *drive, struct request *rq)
 	sense_rq->rq_disk = rq->rq_disk;
 	sense_rq->cmd[0] = GPCMD_REQUEST_SENSE;
 	sense_rq->cmd[4] = cmd_len;
-	sense_rq->cmd_type = REQ_TYPE_SENSE;
+	sense_rq->cmd_type = REQ_TYPE_ATA_SENSE;
 	sense_rq->cmd_flags |= REQ_PREEMPT;
 
 	if (drive->media == ide_tape)
@@ -310,7 +310,7 @@ int ide_cd_get_xferlen(struct request *rq)
 	switch (rq->cmd_type) {
 	case REQ_TYPE_FS:
 		return 32768;
-	case REQ_TYPE_SENSE:
+	case REQ_TYPE_ATA_SENSE:
 	case REQ_TYPE_BLOCK_PC:
 	case REQ_TYPE_ATA_PC:
 		return blk_rq_bytes(rq);
@@ -477,7 +477,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 		if (uptodate == 0)
 			drive->failed_pc = NULL;
 
-		if (rq->cmd_type == REQ_TYPE_SPECIAL) {
+		if (rq->cmd_type == REQ_TYPE_DRV_PRIV) {
 			rq->errors = 0;
 			error = 0;
 		} else {
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 0b510bafd90e29..64a6b827b3dd12 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -210,7 +210,7 @@ static void cdrom_analyze_sense_data(ide_drive_t *drive,
 static void ide_cd_complete_failed_rq(ide_drive_t *drive, struct request *rq)
 {
 	/*
-	 * For REQ_TYPE_SENSE, "rq->special" points to the original
+	 * For REQ_TYPE_ATA_SENSE, "rq->special" points to the original
 	 * failed request.  Also, the sense data should be read
 	 * directly from rq which might be different from the original
 	 * sense buffer if it got copied during mapping.
@@ -285,7 +285,7 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat)
 				  "stat 0x%x",
 				  rq->cmd[0], rq->cmd_type, err, stat);
 
-	if (rq->cmd_type == REQ_TYPE_SENSE) {
+	if (rq->cmd_type == REQ_TYPE_ATA_SENSE) {
 		/*
 		 * We got an error trying to get sense info from the drive
 		 * (probably while trying to recover from a former error).
@@ -526,7 +526,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
 	ide_expiry_t *expiry = NULL;
 	int dma_error = 0, dma, thislen, uptodate = 0;
 	int write = (rq_data_dir(rq) == WRITE) ? 1 : 0, rc = 0;
-	int sense = (rq->cmd_type == REQ_TYPE_SENSE);
+	int sense = (rq->cmd_type == REQ_TYPE_ATA_SENSE);
 	unsigned int timeout;
 	u16 len;
 	u8 ireason, stat;
@@ -791,7 +791,7 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq,
 		if (cdrom_start_rw(drive, rq) == ide_stopped)
 			goto out_end;
 		break;
-	case REQ_TYPE_SENSE:
+	case REQ_TYPE_ATA_SENSE:
 	case REQ_TYPE_BLOCK_PC:
 	case REQ_TYPE_ATA_PC:
 		if (!rq->timeout)
@@ -799,7 +799,7 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq,
 
 		cdrom_do_block_pc(drive, rq);
 		break;
-	case REQ_TYPE_SPECIAL:
+	case REQ_TYPE_DRV_PRIV:
 		/* right now this can only be a reset... */
 		uptodate = 1;
 		goto out_end;
diff --git a/drivers/ide/ide-cd_ioctl.c b/drivers/ide/ide-cd_ioctl.c
index 02caa7dd51c83f..066e3903651842 100644
--- a/drivers/ide/ide-cd_ioctl.c
+++ b/drivers/ide/ide-cd_ioctl.c
@@ -304,7 +304,7 @@ int ide_cdrom_reset(struct cdrom_device_info *cdi)
 	int ret;
 
 	rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
-	rq->cmd_type = REQ_TYPE_SPECIAL;
+	rq->cmd_type = REQ_TYPE_DRV_PRIV;
 	rq->cmd_flags = REQ_QUIET;
 	ret = blk_execute_rq(drive->queue, cd->disk, rq, 0);
 	blk_put_request(rq);
diff --git a/drivers/ide/ide-devsets.c b/drivers/ide/ide-devsets.c
index 9e98122f646e31..b05a74d78ef560 100644
--- a/drivers/ide/ide-devsets.c
+++ b/drivers/ide/ide-devsets.c
@@ -166,7 +166,7 @@ int ide_devset_execute(ide_drive_t *drive, const struct ide_devset *setting,
 		return setting->set(drive, arg);
 
 	rq = blk_get_request(q, READ, __GFP_WAIT);
-	rq->cmd_type = REQ_TYPE_SPECIAL;
+	rq->cmd_type = REQ_TYPE_DRV_PRIV;
 	rq->cmd_len = 5;
 	rq->cmd[0] = REQ_DEVSET_EXEC;
 	*(int *)&rq->cmd[1] = arg;
diff --git a/drivers/ide/ide-eh.c b/drivers/ide/ide-eh.c
index 32970664c27544..d6da011299f582 100644
--- a/drivers/ide/ide-eh.c
+++ b/drivers/ide/ide-eh.c
@@ -129,7 +129,7 @@ ide_startstop_t ide_error(ide_drive_t *drive, const char *msg, u8 stat)
 
 			if (cmd)
 				ide_complete_cmd(drive, cmd, stat, err);
-		} else if (blk_pm_request(rq)) {
+		} else if (ata_pm_request(rq)) {
 			rq->errors = 1;
 			ide_complete_pm_rq(drive, rq);
 			return ide_stopped;
@@ -147,7 +147,7 @@ static inline void ide_complete_drive_reset(ide_drive_t *drive, int err)
 {
 	struct request *rq = drive->hwif->rq;
 
-	if (rq && rq->cmd_type == REQ_TYPE_SPECIAL &&
+	if (rq && rq->cmd_type == REQ_TYPE_DRV_PRIV &&
 	    rq->cmd[0] == REQ_DRIVE_RESET) {
 		if (err <= 0 && rq->errors == 0)
 			rq->errors = -EIO;
diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c
index 8c6363cdd2084a..2fb5350c541050 100644
--- a/drivers/ide/ide-floppy.c
+++ b/drivers/ide/ide-floppy.c
@@ -97,7 +97,7 @@ static int ide_floppy_callback(ide_drive_t *drive, int dsc)
 			       "Aborting request!\n");
 	}
 
-	if (rq->cmd_type == REQ_TYPE_SPECIAL)
+	if (rq->cmd_type == REQ_TYPE_DRV_PRIV)
 		rq->errors = uptodate ? 0 : IDE_DRV_ERROR_GENERAL;
 
 	return uptodate;
@@ -246,7 +246,7 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive,
 		} else
 			printk(KERN_ERR PFX "%s: I/O error\n", drive->name);
 
-		if (rq->cmd_type == REQ_TYPE_SPECIAL) {
+		if (rq->cmd_type == REQ_TYPE_DRV_PRIV) {
 			rq->errors = 0;
 			ide_complete_rq(drive, 0, blk_rq_bytes(rq));
 			return ide_stopped;
@@ -265,8 +265,8 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive,
 		pc = &floppy->queued_pc;
 		idefloppy_create_rw_cmd(drive, pc, rq, (unsigned long)block);
 		break;
-	case REQ_TYPE_SPECIAL:
-	case REQ_TYPE_SENSE:
+	case REQ_TYPE_DRV_PRIV:
+	case REQ_TYPE_ATA_SENSE:
 		pc = (struct ide_atapi_pc *)rq->special;
 		break;
 	case REQ_TYPE_BLOCK_PC:
diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index 177db6d5b2f589..669ea1e4579586 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -135,7 +135,7 @@ EXPORT_SYMBOL(ide_complete_rq);
 
 void ide_kill_rq(ide_drive_t *drive, struct request *rq)
 {
-	u8 drv_req = (rq->cmd_type == REQ_TYPE_SPECIAL) && rq->rq_disk;
+	u8 drv_req = (rq->cmd_type == REQ_TYPE_DRV_PRIV) && rq->rq_disk;
 	u8 media = drive->media;
 
 	drive->failed_pc = NULL;
@@ -320,7 +320,7 @@ static ide_startstop_t start_request (ide_drive_t *drive, struct request *rq)
 		goto kill_rq;
 	}
 
-	if (blk_pm_request(rq))
+	if (ata_pm_request(rq))
 		ide_check_pm_state(drive, rq);
 
 	drive->hwif->tp_ops->dev_select(drive);
@@ -342,8 +342,8 @@ static ide_startstop_t start_request (ide_drive_t *drive, struct request *rq)
 
 		if (rq->cmd_type == REQ_TYPE_ATA_TASKFILE)
 			return execute_drive_cmd(drive, rq);
-		else if (blk_pm_request(rq)) {
-			struct request_pm_state *pm = rq->special;
+		else if (ata_pm_request(rq)) {
+			struct ide_pm_state *pm = rq->special;
 #ifdef DEBUG_PM
 			printk("%s: start_power_step(step: %d)\n",
 				drive->name, pm->pm_step);
@@ -353,7 +353,7 @@ static ide_startstop_t start_request (ide_drive_t *drive, struct request *rq)
 			    pm->pm_step == IDE_PM_COMPLETED)
 				ide_complete_pm_rq(drive, rq);
 			return startstop;
-		} else if (!rq->rq_disk && rq->cmd_type == REQ_TYPE_SPECIAL)
+		} else if (!rq->rq_disk && rq->cmd_type == REQ_TYPE_DRV_PRIV)
 			/*
 			 * TODO: Once all ULDs have been modified to
 			 * check for specific op codes rather than
@@ -538,7 +538,7 @@ repeat:
 		 * state machine.
 		 */
 		if ((drive->dev_flags & IDE_DFLAG_BLOCKED) &&
-		    blk_pm_request(rq) == 0 &&
+		    ata_pm_request(rq) == 0 &&
 		    (rq->cmd_flags & REQ_PREEMPT) == 0) {
 			/* there should be no pending command at this point */
 			ide_unlock_port(hwif);
diff --git a/drivers/ide/ide-ioctls.c b/drivers/ide/ide-ioctls.c
index 6233fa2cb8a978..aa2e9b77b20d39 100644
--- a/drivers/ide/ide-ioctls.c
+++ b/drivers/ide/ide-ioctls.c
@@ -222,7 +222,7 @@ static int generic_drive_reset(ide_drive_t *drive)
 	int ret = 0;
 
 	rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
-	rq->cmd_type = REQ_TYPE_SPECIAL;
+	rq->cmd_type = REQ_TYPE_DRV_PRIV;
 	rq->cmd_len = 1;
 	rq->cmd[0] = REQ_DRIVE_RESET;
 	if (blk_execute_rq(drive->queue, NULL, rq, 1))
diff --git a/drivers/ide/ide-park.c b/drivers/ide/ide-park.c
index ca958604cda212..c808685204883d 100644
--- a/drivers/ide/ide-park.c
+++ b/drivers/ide/ide-park.c
@@ -34,7 +34,7 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout)
 	rq = blk_get_request(q, READ, __GFP_WAIT);
 	rq->cmd[0] = REQ_PARK_HEADS;
 	rq->cmd_len = 1;
-	rq->cmd_type = REQ_TYPE_SPECIAL;
+	rq->cmd_type = REQ_TYPE_DRV_PRIV;
 	rq->special = &timeout;
 	rc = blk_execute_rq(q, NULL, rq, 1);
 	blk_put_request(rq);
@@ -51,7 +51,7 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout)
 
 	rq->cmd[0] = REQ_UNPARK_HEADS;
 	rq->cmd_len = 1;
-	rq->cmd_type = REQ_TYPE_SPECIAL;
+	rq->cmd_type = REQ_TYPE_DRV_PRIV;
 	elv_add_request(q, rq, ELEVATOR_INSERT_FRONT);
 
 out:
diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c
index 8d1e32d7cd9767..081e43458d50f7 100644
--- a/drivers/ide/ide-pm.c
+++ b/drivers/ide/ide-pm.c
@@ -8,7 +8,7 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg)
 	ide_drive_t *pair = ide_get_pair_dev(drive);
 	ide_hwif_t *hwif = drive->hwif;
 	struct request *rq;
-	struct request_pm_state rqpm;
+	struct ide_pm_state rqpm;
 	int ret;
 
 	if (ide_port_acpi(hwif)) {
@@ -19,7 +19,7 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg)
 
 	memset(&rqpm, 0, sizeof(rqpm));
 	rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
-	rq->cmd_type = REQ_TYPE_PM_SUSPEND;
+	rq->cmd_type = REQ_TYPE_ATA_PM_SUSPEND;
 	rq->special = &rqpm;
 	rqpm.pm_step = IDE_PM_START_SUSPEND;
 	if (mesg.event == PM_EVENT_PRETHAW)
@@ -38,13 +38,43 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg)
 	return ret;
 }
 
+static void ide_end_sync_rq(struct request *rq, int error)
+{
+	complete(rq->end_io_data);
+}
+
+static int ide_pm_execute_rq(struct request *rq)
+{
+	struct request_queue *q = rq->q;
+	DECLARE_COMPLETION_ONSTACK(wait);
+
+	rq->end_io_data = &wait;
+	rq->end_io = ide_end_sync_rq;
+
+	spin_lock_irq(q->queue_lock);
+	if (unlikely(blk_queue_dying(q))) {
+		rq->cmd_flags |= REQ_QUIET;
+		rq->errors = -ENXIO;
+		__blk_end_request_all(rq, rq->errors);
+		spin_unlock_irq(q->queue_lock);
+		return -ENXIO;
+	}
+	__elv_add_request(q, rq, ELEVATOR_INSERT_FRONT);
+	__blk_run_queue_uncond(q);
+	spin_unlock_irq(q->queue_lock);
+
+	wait_for_completion_io(&wait);
+
+	return rq->errors ? -EIO : 0;
+}
+
 int generic_ide_resume(struct device *dev)
 {
 	ide_drive_t *drive = to_ide_device(dev);
 	ide_drive_t *pair = ide_get_pair_dev(drive);
 	ide_hwif_t *hwif = drive->hwif;
 	struct request *rq;
-	struct request_pm_state rqpm;
+	struct ide_pm_state rqpm;
 	int err;
 
 	if (ide_port_acpi(hwif)) {
@@ -59,13 +89,13 @@ int generic_ide_resume(struct device *dev)
 
 	memset(&rqpm, 0, sizeof(rqpm));
 	rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
-	rq->cmd_type = REQ_TYPE_PM_RESUME;
+	rq->cmd_type = REQ_TYPE_ATA_PM_RESUME;
 	rq->cmd_flags |= REQ_PREEMPT;
 	rq->special = &rqpm;
 	rqpm.pm_step = IDE_PM_START_RESUME;
 	rqpm.pm_state = PM_EVENT_ON;
 
-	err = blk_execute_rq(drive->queue, NULL, rq, 1);
+	err = ide_pm_execute_rq(rq);
 	blk_put_request(rq);
 
 	if (err == 0 && dev->driver) {
@@ -80,7 +110,7 @@ int generic_ide_resume(struct device *dev)
 
 void ide_complete_power_step(ide_drive_t *drive, struct request *rq)
 {
-	struct request_pm_state *pm = rq->special;
+	struct ide_pm_state *pm = rq->special;
 
 #ifdef DEBUG_PM
 	printk(KERN_INFO "%s: complete_power_step(step: %d)\n",
@@ -110,7 +140,7 @@ void ide_complete_power_step(ide_drive_t *drive, struct request *rq)
 
 ide_startstop_t ide_start_power_step(ide_drive_t *drive, struct request *rq)
 {
-	struct request_pm_state *pm = rq->special;
+	struct ide_pm_state *pm = rq->special;
 	struct ide_cmd cmd = { };
 
 	switch (pm->pm_step) {
@@ -182,7 +212,7 @@ out_do_tf:
 void ide_complete_pm_rq(ide_drive_t *drive, struct request *rq)
 {
 	struct request_queue *q = drive->queue;
-	struct request_pm_state *pm = rq->special;
+	struct ide_pm_state *pm = rq->special;
 	unsigned long flags;
 
 	ide_complete_power_step(drive, rq);
@@ -191,10 +221,10 @@ void ide_complete_pm_rq(ide_drive_t *drive, struct request *rq)
 
 #ifdef DEBUG_PM
 	printk("%s: completing PM request, %s\n", drive->name,
-	       (rq->cmd_type == REQ_TYPE_PM_SUSPEND) ? "suspend" : "resume");
+	       (rq->cmd_type == REQ_TYPE_ATA_PM_SUSPEND) ? "suspend" : "resume");
 #endif
 	spin_lock_irqsave(q->queue_lock, flags);
-	if (rq->cmd_type == REQ_TYPE_PM_SUSPEND)
+	if (rq->cmd_type == REQ_TYPE_ATA_PM_SUSPEND)
 		blk_stop_queue(q);
 	else
 		drive->dev_flags &= ~IDE_DFLAG_BLOCKED;
@@ -208,13 +238,13 @@ void ide_complete_pm_rq(ide_drive_t *drive, struct request *rq)
 
 void ide_check_pm_state(ide_drive_t *drive, struct request *rq)
 {
-	struct request_pm_state *pm = rq->special;
+	struct ide_pm_state *pm = rq->special;
 
-	if (rq->cmd_type == REQ_TYPE_PM_SUSPEND &&
+	if (rq->cmd_type == REQ_TYPE_ATA_PM_SUSPEND &&
 	    pm->pm_step == IDE_PM_START_SUSPEND)
 		/* Mark drive blocked when starting the suspend sequence. */
 		drive->dev_flags |= IDE_DFLAG_BLOCKED;
-	else if (rq->cmd_type == REQ_TYPE_PM_RESUME &&
+	else if (rq->cmd_type == REQ_TYPE_ATA_PM_RESUME &&
 		 pm->pm_step == IDE_PM_START_RESUME) {
 		/*
 		 * The first thing we do on wakeup is to wait for BSY bit to
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 6eb738ca6d2f35..f5d51d1d09ee48 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -576,8 +576,8 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive,
 		      rq->cmd[0], (unsigned long long)blk_rq_pos(rq),
 		      blk_rq_sectors(rq));
 
-	BUG_ON(!(rq->cmd_type == REQ_TYPE_SPECIAL ||
-		 rq->cmd_type == REQ_TYPE_SENSE));
+	BUG_ON(!(rq->cmd_type == REQ_TYPE_DRV_PRIV ||
+		 rq->cmd_type == REQ_TYPE_ATA_SENSE));
 
 	/* Retry a failed packet command */
 	if (drive->failed_pc && drive->pc->c[0] == REQUEST_SENSE) {
@@ -853,7 +853,7 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int size)
 	BUG_ON(size < 0 || size % tape->blk_size);
 
 	rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
-	rq->cmd_type = REQ_TYPE_SPECIAL;
+	rq->cmd_type = REQ_TYPE_DRV_PRIV;
 	rq->cmd[13] = cmd;
 	rq->rq_disk = tape->disk;
 	rq->__sector = tape->first_frame;
diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c
index dabb88b1cbec69..0979e126fff1e6 100644
--- a/drivers/ide/ide-taskfile.c
+++ b/drivers/ide/ide-taskfile.c
@@ -186,7 +186,7 @@ static ide_startstop_t task_no_data_intr(ide_drive_t *drive)
 	    tf->command == ATA_CMD_CHK_POWER) {
 		struct request *rq = hwif->rq;
 
-		if (blk_pm_request(rq))
+		if (ata_pm_request(rq))
 			ide_complete_pm_rq(drive, rq);
 		else
 			ide_finish_cmd(drive, cmd, stat);
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index fa028fa82df41b..cb64e64a478954 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -55,7 +55,7 @@ static void bch_bio_submit_split_done(struct closure *cl)
 
 	s->bio->bi_end_io = s->bi_end_io;
 	s->bio->bi_private = s->bi_private;
-	bio_endio_nodec(s->bio, 0);
+	bio_endio(s->bio, 0);
 
 	closure_debug_destroy(&s->cl);
 	mempool_free(s, s->p->bio_split_hook);
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index ab43faddb447e3..4afb2d26b148a3 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -15,6 +15,7 @@
 #include <linux/module.h>
 #include <linux/hash.h>
 #include <linux/random.h>
+#include <linux/backing-dev.h>
 
 #include <trace/events/bcache.h>
 
@@ -619,7 +620,7 @@ static void do_bio_hook(struct search *s, struct bio *orig_bio)
 	bio->bi_end_io		= request_endio;
 	bio->bi_private		= &s->cl;
 
-	atomic_set(&bio->bi_cnt, 3);
+	bio_cnt_set(bio, 3);
 }
 
 static void search_free(struct closure *cl)
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 7755af35186762..41b2594a80c630 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -86,12 +86,6 @@ static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio)
 {
 	bio->bi_end_io = h->bi_end_io;
 	bio->bi_private = h->bi_private;
-
-	/*
-	 * Must bump bi_remaining to allow bio to complete with
-	 * restored bi_end_io.
-	 */
-	atomic_inc(&bio->bi_remaining);
 }
 
 /*----------------------------------------------------------------*/
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 089d62751f7ff2..743fa9bbae9eaa 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -1254,8 +1254,6 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error)
 			dm_bio_restore(bd, bio);
 			bio_record->details.bi_bdev = NULL;
 
-			atomic_inc(&bio->bi_remaining);
-
 			queue_bio(ms, bio, rw);
 			return DM_ENDIO_INCOMPLETE;
 		}
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index f83a0f3fc36566..7c82d3ccce871f 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -1478,7 +1478,6 @@ out:
 	if (full_bio) {
 		full_bio->bi_end_io = pe->full_bio_end_io;
 		full_bio->bi_private = pe->full_bio_private;
-		atomic_inc(&full_bio->bi_remaining);
 	}
 	increment_pending_exceptions_done_count();
 
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 16ba55ad708992..a5f94125ad01f6 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -942,21 +942,28 @@ static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *
 {
 	unsigned type = dm_table_get_type(t);
 	unsigned per_bio_data_size = 0;
-	struct dm_target *tgt;
 	unsigned i;
 
-	if (unlikely(type == DM_TYPE_NONE)) {
+	switch (type) {
+	case DM_TYPE_BIO_BASED:
+		for (i = 0; i < t->num_targets; i++) {
+			struct dm_target *tgt = t->targets + i;
+
+			per_bio_data_size = max(per_bio_data_size,
+						tgt->per_bio_data_size);
+		}
+		t->mempools = dm_alloc_bio_mempools(t->integrity_supported,
+						    per_bio_data_size);
+		break;
+	case DM_TYPE_REQUEST_BASED:
+	case DM_TYPE_MQ_REQUEST_BASED:
+		t->mempools = dm_alloc_rq_mempools(md, type);
+		break;
+	default:
 		DMWARN("no table type is set, can't allocate mempools");
 		return -EINVAL;
 	}
 
-	if (type == DM_TYPE_BIO_BASED)
-		for (i = 0; i < t->num_targets; i++) {
-			tgt = t->targets + i;
-			per_bio_data_size = max(per_bio_data_size, tgt->per_bio_data_size);
-		}
-
-	t->mempools = dm_alloc_md_mempools(md, type, t->integrity_supported, per_bio_data_size);
 	if (!t->mempools)
 		return -ENOMEM;
 
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 921aafd12aee67..e852602c009198 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -793,10 +793,9 @@ static void inc_remap_and_issue_cell(struct thin_c *tc,
 
 static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
 {
-	if (m->bio) {
+	if (m->bio)
 		m->bio->bi_end_io = m->saved_bi_end_io;
-		atomic_inc(&m->bio->bi_remaining);
-	}
+
 	cell_error(m->tc->pool, m->cell);
 	list_del(&m->list);
 	mempool_free(m, m->tc->pool->mapping_pool);
@@ -810,10 +809,8 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
 	int r;
 
 	bio = m->bio;
-	if (bio) {
+	if (bio)
 		bio->bi_end_io = m->saved_bi_end_io;
-		atomic_inc(&bio->bi_remaining);
-	}
 
 	if (m->err) {
 		cell_error(pool, m->cell);
diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c
index 66616db33e6fdb..bb9c6a00e4b05e 100644
--- a/drivers/md/dm-verity.c
+++ b/drivers/md/dm-verity.c
@@ -459,7 +459,7 @@ static void verity_finish_io(struct dm_verity_io *io, int error)
 	bio->bi_end_io = io->orig_bi_end_io;
 	bio->bi_private = io->orig_bi_private;
 
-	bio_endio_nodec(bio, error);
+	bio_endio(bio, error);
 }
 
 static void verity_work(struct work_struct *w)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 2caf492890d64b..d72829922eb6c8 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -990,57 +990,6 @@ static void clone_endio(struct bio *bio, int error)
 	dec_pending(io, error);
 }
 
-/*
- * Partial completion handling for request-based dm
- */
-static void end_clone_bio(struct bio *clone, int error)
-{
-	struct dm_rq_clone_bio_info *info =
-		container_of(clone, struct dm_rq_clone_bio_info, clone);
-	struct dm_rq_target_io *tio = info->tio;
-	struct bio *bio = info->orig;
-	unsigned int nr_bytes = info->orig->bi_iter.bi_size;
-
-	bio_put(clone);
-
-	if (tio->error)
-		/*
-		 * An error has already been detected on the request.
-		 * Once error occurred, just let clone->end_io() handle
-		 * the remainder.
-		 */
-		return;
-	else if (error) {
-		/*
-		 * Don't notice the error to the upper layer yet.
-		 * The error handling decision is made by the target driver,
-		 * when the request is completed.
-		 */
-		tio->error = error;
-		return;
-	}
-
-	/*
-	 * I/O for the bio successfully completed.
-	 * Notice the data completion to the upper layer.
-	 */
-
-	/*
-	 * bios are processed from the head of the list.
-	 * So the completing bio should always be rq->bio.
-	 * If it's not, something wrong is happening.
-	 */
-	if (tio->orig->bio != bio)
-		DMERR("bio completion is going in the middle of the request");
-
-	/*
-	 * Update the original request.
-	 * Do not use blk_end_request() here, because it may complete
-	 * the original request before the clone, and break the ordering.
-	 */
-	blk_update_request(tio->orig, 0, nr_bytes);
-}
-
 static struct dm_rq_target_io *tio_from_request(struct request *rq)
 {
 	return (rq->q->mq_ops ? blk_mq_rq_to_pdu(rq) : rq->special);
@@ -1087,8 +1036,6 @@ static void free_rq_clone(struct request *clone)
 	struct dm_rq_target_io *tio = clone->end_io_data;
 	struct mapped_device *md = tio->md;
 
-	blk_rq_unprep_clone(clone);
-
 	if (md->type == DM_TYPE_MQ_REQUEST_BASED)
 		/* stacked on blk-mq queue(s) */
 		tio->ti->type->release_clone_rq(clone);
@@ -1827,39 +1774,13 @@ static void dm_dispatch_clone_request(struct request *clone, struct request *rq)
 		dm_complete_request(rq, r);
 }
 
-static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
-				 void *data)
+static void setup_clone(struct request *clone, struct request *rq,
+		        struct dm_rq_target_io *tio)
 {
-	struct dm_rq_target_io *tio = data;
-	struct dm_rq_clone_bio_info *info =
-		container_of(bio, struct dm_rq_clone_bio_info, clone);
-
-	info->orig = bio_orig;
-	info->tio = tio;
-	bio->bi_end_io = end_clone_bio;
-
-	return 0;
-}
-
-static int setup_clone(struct request *clone, struct request *rq,
-		       struct dm_rq_target_io *tio, gfp_t gfp_mask)
-{
-	int r;
-
-	r = blk_rq_prep_clone(clone, rq, tio->md->bs, gfp_mask,
-			      dm_rq_bio_constructor, tio);
-	if (r)
-		return r;
-
-	clone->cmd = rq->cmd;
-	clone->cmd_len = rq->cmd_len;
-	clone->sense = rq->sense;
+	blk_rq_prep_clone(clone, rq);
 	clone->end_io = end_clone_request;
 	clone->end_io_data = tio;
-
 	tio->clone = clone;
-
-	return 0;
 }
 
 static struct request *clone_rq(struct request *rq, struct mapped_device *md,
@@ -1880,12 +1801,7 @@ static struct request *clone_rq(struct request *rq, struct mapped_device *md,
 		clone = tio->clone;
 
 	blk_rq_init(NULL, clone);
-	if (setup_clone(clone, rq, tio, gfp_mask)) {
-		/* -ENOMEM */
-		if (alloc_clone)
-			free_clone_request(md, clone);
-		return NULL;
-	}
+	setup_clone(clone, rq, tio);
 
 	return clone;
 }
@@ -1979,11 +1895,7 @@ static int map_request(struct dm_rq_target_io *tio, struct request *rq,
 		}
 		if (r != DM_MAPIO_REMAPPED)
 			return r;
-		if (setup_clone(clone, rq, tio, GFP_ATOMIC)) {
-			/* -ENOMEM */
-			ti->type->release_clone_rq(clone);
-			return DM_MAPIO_REQUEUE;
-		}
+		setup_clone(clone, rq, tio);
 	}
 
 	switch (r) {
@@ -2168,7 +2080,7 @@ static int dm_any_congested(void *congested_data, int bdi_bits)
 			 * the query about congestion status of request_queue
 			 */
 			if (dm_request_based(md))
-				r = md->queue->backing_dev_info.state &
+				r = md->queue->backing_dev_info.wb.state &
 				    bdi_bits;
 			else
 				r = dm_table_any_congested(map, bdi_bits);
@@ -2437,8 +2349,6 @@ static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
 		goto out;
 	}
 
-	BUG_ON(!p || md->io_pool || md->rq_pool || md->bs);
-
 	md->io_pool = p->io_pool;
 	p->io_pool = NULL;
 	md->rq_pool = p->rq_pool;
@@ -3544,48 +3454,23 @@ int dm_noflush_suspending(struct dm_target *ti)
 }
 EXPORT_SYMBOL_GPL(dm_noflush_suspending);
 
-struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type,
-					    unsigned integrity, unsigned per_bio_data_size)
+struct dm_md_mempools *dm_alloc_bio_mempools(unsigned integrity,
+					     unsigned per_bio_data_size)
 {
-	struct dm_md_mempools *pools = kzalloc(sizeof(*pools), GFP_KERNEL);
-	struct kmem_cache *cachep = NULL;
-	unsigned int pool_size = 0;
+	struct dm_md_mempools *pools;
+	unsigned int pool_size = dm_get_reserved_bio_based_ios();
 	unsigned int front_pad;
 
+	pools = kzalloc(sizeof(*pools), GFP_KERNEL);
 	if (!pools)
 		return NULL;
 
-	type = filter_md_type(type, md);
+	front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) +
+		offsetof(struct dm_target_io, clone);
 
-	switch (type) {
-	case DM_TYPE_BIO_BASED:
-		cachep = _io_cache;
-		pool_size = dm_get_reserved_bio_based_ios();
-		front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
-		break;
-	case DM_TYPE_REQUEST_BASED:
-		cachep = _rq_tio_cache;
-		pool_size = dm_get_reserved_rq_based_ios();
-		pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache);
-		if (!pools->rq_pool)
-			goto out;
-		/* fall through to setup remaining rq-based pools */
-	case DM_TYPE_MQ_REQUEST_BASED:
-		if (!pool_size)
-			pool_size = dm_get_reserved_rq_based_ios();
-		front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
-		/* per_bio_data_size is not used. See __bind_mempools(). */
-		WARN_ON(per_bio_data_size != 0);
-		break;
-	default:
-		BUG();
-	}
-
-	if (cachep) {
-		pools->io_pool = mempool_create_slab_pool(pool_size, cachep);
-		if (!pools->io_pool)
-			goto out;
-	}
+	pools->io_pool = mempool_create_slab_pool(pool_size, _io_cache);
+	if (!pools->io_pool)
+		goto out;
 
 	pools->bs = bioset_create_nobvec(pool_size, front_pad);
 	if (!pools->bs)
@@ -3595,10 +3480,34 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned t
 		goto out;
 
 	return pools;
-
 out:
 	dm_free_md_mempools(pools);
+	return NULL;
+}
+
+struct dm_md_mempools *dm_alloc_rq_mempools(struct mapped_device *md,
+					    unsigned type)
+{
+	unsigned int pool_size = dm_get_reserved_rq_based_ios();
+	struct dm_md_mempools *pools;
+
+	pools = kzalloc(sizeof(*pools), GFP_KERNEL);
+	if (!pools)
+		return NULL;
+
+	if (filter_md_type(type, md) == DM_TYPE_REQUEST_BASED) {
+		pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache);
+		if (!pools->rq_pool)
+			goto out;
+	}
 
+	pools->io_pool = mempool_create_slab_pool(pool_size, _rq_tio_cache);
+	if (!pools->io_pool)
+		goto out;
+
+	return pools;
+out:
+	dm_free_md_mempools(pools);
 	return NULL;
 }
 
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 6123c2bf9150cb..7fff744f0865b4 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -14,6 +14,7 @@
 #include <linux/device-mapper.h>
 #include <linux/list.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/hdreg.h>
 #include <linux/completion.h>
 #include <linux/kobject.h>
@@ -222,8 +223,9 @@ void dm_kcopyd_exit(void);
 /*
  * Mempool operations
  */
-struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type,
-					    unsigned integrity, unsigned per_bio_data_size);
+struct dm_md_mempools *dm_alloc_bio_mempools(unsigned integrity,
+					     unsigned per_bio_data_size);
+struct dm_md_mempools *dm_alloc_rq_mempools(struct mapped_device *md, unsigned type);
 void dm_free_md_mempools(struct dm_md_mempools *pools);
 
 /*
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 4046a6c6f223b0..7da6e9c3cb53eb 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -16,6 +16,7 @@
 #define _MD_MD_H
 
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/kobject.h>
 #include <linux/list.h>
 #include <linux/mm.h>
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 9157a29c8dbf13..f80f1af61ce70b 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -745,7 +745,7 @@ static int raid1_congested(struct mddev *mddev, int bits)
 	struct r1conf *conf = mddev->private;
 	int i, ret = 0;
 
-	if ((bits & (1 << BDI_async_congested)) &&
+	if ((bits & (1 << WB_async_congested)) &&
 	    conf->pending_count >= max_queued_requests)
 		return 1;
 
@@ -760,7 +760,7 @@ static int raid1_congested(struct mddev *mddev, int bits)
 			/* Note the '|| 1' - when read_balance prefers
 			 * non-congested targets, it can be removed
 			 */
-			if ((bits & (1<<BDI_async_congested)) || 1)
+			if ((bits & (1 << WB_async_congested)) || 1)
 				ret |= bdi_congested(&q->backing_dev_info, bits);
 			else
 				ret &= bdi_congested(&q->backing_dev_info, bits);
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index e793ab6b35705e..fca825718f29a2 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -914,7 +914,7 @@ static int raid10_congested(struct mddev *mddev, int bits)
 	struct r10conf *conf = mddev->private;
 	int i, ret = 0;
 
-	if ((bits & (1 << BDI_async_congested)) &&
+	if ((bits & (1 << WB_async_congested)) &&
 	    conf->pending_count >= max_queued_requests)
 		return 1;
 
diff --git a/drivers/mmc/host/android-goldfish.c b/drivers/mmc/host/android-goldfish.c
index 8b4e20a3f16cd4..b1eac719a4cca2 100644
--- a/drivers/mmc/host/android-goldfish.c
+++ b/drivers/mmc/host/android-goldfish.c
@@ -42,10 +42,10 @@
 #include <linux/spinlock.h>
 #include <linux/timer.h>
 #include <linux/clk.h>
+#include <linux/scatterlist.h>
 
 #include <asm/io.h>
 #include <asm/irq.h>
-#include <asm/scatterlist.h>
 
 #include <asm/types.h>
 #include <asm/io.h>
diff --git a/drivers/mtd/devices/block2mtd.c b/drivers/mtd/devices/block2mtd.c
index b16f3cda97ff0a..e2c0057737e674 100644
--- a/drivers/mtd/devices/block2mtd.c
+++ b/drivers/mtd/devices/block2mtd.c
@@ -20,6 +20,7 @@
 #include <linux/delay.h>
 #include <linux/fs.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/bio.h>
 #include <linux/pagemap.h>
 #include <linux/list.h>
diff --git a/drivers/s390/block/dasd_genhd.c b/drivers/s390/block/dasd_genhd.c
index 90f39f79f5d75a..ef1d9fb06cab28 100644
--- a/drivers/s390/block/dasd_genhd.c
+++ b/drivers/s390/block/dasd_genhd.c
@@ -99,9 +99,8 @@ void dasd_gendisk_free(struct dasd_block *block)
 int dasd_scan_partitions(struct dasd_block *block)
 {
 	struct block_device *bdev;
-	int retry, rc;
+	int rc;
 
-	retry = 5;
 	bdev = bdget_disk(block->gdp, 0);
 	if (!bdev) {
 		DBF_DEV_EVENT(DBF_ERR, block->base, "%s",
@@ -116,19 +115,11 @@ int dasd_scan_partitions(struct dasd_block *block)
 			      rc);
 		return -ENODEV;
 	}
-	/*
-	 * See fs/partition/check.c:register_disk,rescan_partitions
-	 * Can't call rescan_partitions directly. Use ioctl.
-	 */
-	rc = ioctl_by_bdev(bdev, BLKRRPART, 0);
-	while (rc == -EBUSY && retry > 0) {
-		schedule();
-		rc = ioctl_by_bdev(bdev, BLKRRPART, 0);
-		retry--;
+
+	rc = blkdev_reread_part(bdev);
+	if (rc)
 		DBF_DEV_EVENT(DBF_ERR, block->base,
-			      "scan partitions error, retry %d rc %d",
-			      retry, rc);
-	}
+				"scan partitions error, rc %d", rc);
 
 	/*
 	 * Since the matching blkdev_put call to the blkdev_get in
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h b/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h
index d72605864b0a66..14562788e4e059 100644
--- a/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h
+++ b/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h
@@ -55,9 +55,7 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
 	if (PagePrivate(page))
 		page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE);
 
-	if (TestClearPageDirty(page))
-		account_page_cleaned(page, mapping);
-
+	cancel_dirty_page(page);
 	ClearPageMappedToDisk(page);
 	ll_delete_from_page_cache(page);
 }
diff --git a/fs/block_dev.c b/fs/block_dev.c
index c7e4163ede87f3..f04c873a7365c1 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -14,6 +14,7 @@
 #include <linux/device_cgroup.h>
 #include <linux/highmem.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/module.h>
 #include <linux/blkpg.h>
 #include <linux/magic.h>
@@ -546,7 +547,8 @@ static struct file_system_type bd_type = {
 	.kill_sb	= kill_anon_super,
 };
 
-static struct super_block *blockdev_superblock __read_mostly;
+struct super_block *blockdev_superblock __read_mostly;
+EXPORT_SYMBOL_GPL(blockdev_superblock);
 
 void __init bdev_cache_init(void)
 {
@@ -687,11 +689,6 @@ static struct block_device *bd_acquire(struct inode *inode)
 	return bdev;
 }
 
-int sb_is_blkdev_sb(struct super_block *sb)
-{
-	return sb == blockdev_superblock;
-}
-
 /* Call when you free inode */
 
 void bd_forget(struct inode *inode)
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 2ef9a4b72d06e4..0bccf18dc1dca0 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1745,7 +1745,7 @@ static void end_workqueue_fn(struct btrfs_work *work)
 	bio->bi_private = end_io_wq->private;
 	bio->bi_end_io = end_io_wq->end_io;
 	kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq);
-	bio_endio_nodec(bio, error);
+	bio_endio(bio, error);
 }
 
 static int cleaner_kthread(void *arg)
@@ -3269,11 +3269,8 @@ static int write_dev_supers(struct btrfs_device *device,
  */
 static void btrfs_end_empty_barrier(struct bio *bio, int err)
 {
-	if (err) {
-		if (err == -EOPNOTSUPP)
-			set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
+	if (err)
 		clear_bit(BIO_UPTODATE, &bio->bi_flags);
-	}
 	if (bio->bi_private)
 		complete(bio->bi_private);
 	bio_put(bio);
@@ -3301,11 +3298,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
 
 		wait_for_completion(&device->flush_wait);
 
-		if (bio_flagged(bio, BIO_EOPNOTSUPP)) {
-			printk_in_rcu("BTRFS: disabling barriers on dev %s\n",
-				      rcu_str_deref(device->name));
-			device->nobarriers = 1;
-		} else if (!bio_flagged(bio, BIO_UPTODATE)) {
+		if (!bio_flagged(bio, BIO_UPTODATE)) {
 			ret = -EIO;
 			btrfs_dev_stat_inc_and_print(device,
 				BTRFS_DEV_STAT_FLUSH_ERRS);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index c32d226bfeccbb..c374e1e71e5f3e 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2767,8 +2767,6 @@ static int __must_check submit_one_bio(int rw, struct bio *bio,
 	else
 		btrfsic_submit_bio(rw, bio);
 
-	if (bio_flagged(bio, BIO_EOPNOTSUPP))
-		ret = -EOPNOTSUPP;
 	bio_put(bio);
 	return ret;
 }
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 174f5e1e00abfa..53af23f2c087ad 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -345,7 +345,7 @@ loop_lock:
 		    waitqueue_active(&fs_info->async_submit_wait))
 			wake_up(&fs_info->async_submit_wait);
 
-		BUG_ON(atomic_read(&cur->bi_cnt) == 0);
+		BUG_ON(atomic_read(&cur->__bi_cnt) == 0);
 
 		/*
 		 * if we're doing the sync list, record that our
@@ -5586,10 +5586,10 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 
 static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio, int err)
 {
-	if (likely(bbio->flags & BTRFS_BIO_ORIG_BIO_SUBMITTED))
-		bio_endio_nodec(bio, err);
-	else
-		bio_endio(bio, err);
+	bio->bi_private = bbio->private;
+	bio->bi_end_io = bbio->end_io;
+	bio_endio(bio, err);
+
 	btrfs_put_bbio(bbio);
 }
 
@@ -5633,8 +5633,6 @@ static void btrfs_end_bio(struct bio *bio, int err)
 			bio = bbio->orig_bio;
 		}
 
-		bio->bi_private = bbio->private;
-		bio->bi_end_io = bbio->end_io;
 		btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
 		/* only send an error to the higher layers if it is
 		 * beyond the tolerance of the btrfs bio
@@ -5816,8 +5814,6 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
 		/* Shoud be the original bio. */
 		WARN_ON(bio != bbio->orig_bio);
 
-		bio->bi_private = bbio->private;
-		bio->bi_end_io = bbio->end_io;
 		btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
 		bio->bi_iter.bi_sector = logical >> 9;
 
@@ -5898,10 +5894,8 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 		if (dev_nr < total_devs - 1) {
 			bio = btrfs_bio_clone(first_bio, GFP_NOFS);
 			BUG_ON(!bio); /* -ENOMEM */
-		} else {
+		} else
 			bio = first_bio;
-			bbio->flags |= BTRFS_BIO_ORIG_BIO_SUBMITTED;
-		}
 
 		submit_stripe_bio(root, bbio, bio,
 				  bbio->stripes[dev_nr].physical, dev_nr, rw,
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index ebc31331a83746..cedae0356558d9 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -292,8 +292,6 @@ struct btrfs_bio_stripe {
 struct btrfs_bio;
 typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err);
 
-#define BTRFS_BIO_ORIG_BIO_SUBMITTED	(1 << 0)
-
 struct btrfs_bio {
 	atomic_t refs;
 	atomic_t stripes_pending;
diff --git a/fs/buffer.c b/fs/buffer.c
index c7a5602d01eed2..1cf7a53a02771e 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -30,6 +30,7 @@
 #include <linux/quotaops.h>
 #include <linux/highmem.h>
 #include <linux/export.h>
+#include <linux/backing-dev.h>
 #include <linux/writeback.h>
 #include <linux/hash.h>
 #include <linux/suspend.h>
@@ -44,6 +45,9 @@
 #include <trace/events/block.h>
 
 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
+static int submit_bh_wbc(int rw, struct buffer_head *bh,
+			 unsigned long bio_flags,
+			 struct writeback_control *wbc);
 
 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
 
@@ -623,21 +627,22 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode);
  *
  * If warn is true, then emit a warning if the page is not uptodate and has
  * not been truncated.
+ *
+ * The caller must hold mem_cgroup_begin_page_stat() lock.
  */
-static void __set_page_dirty(struct page *page,
-		struct address_space *mapping, int warn)
+static void __set_page_dirty(struct page *page, struct address_space *mapping,
+			     struct mem_cgroup *memcg, int warn)
 {
 	unsigned long flags;
 
 	spin_lock_irqsave(&mapping->tree_lock, flags);
 	if (page->mapping) {	/* Race with truncate? */
 		WARN_ON_ONCE(warn && !PageUptodate(page));
-		account_page_dirtied(page, mapping);
+		account_page_dirtied(page, mapping, memcg);
 		radix_tree_tag_set(&mapping->page_tree,
 				page_index(page), PAGECACHE_TAG_DIRTY);
 	}
 	spin_unlock_irqrestore(&mapping->tree_lock, flags);
-	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 }
 
 /*
@@ -668,6 +673,7 @@ static void __set_page_dirty(struct page *page,
 int __set_page_dirty_buffers(struct page *page)
 {
 	int newly_dirty;
+	struct mem_cgroup *memcg;
 	struct address_space *mapping = page_mapping(page);
 
 	if (unlikely(!mapping))
@@ -683,11 +689,22 @@ int __set_page_dirty_buffers(struct page *page)
 			bh = bh->b_this_page;
 		} while (bh != head);
 	}
+	/*
+	 * Use mem_group_begin_page_stat() to keep PageDirty synchronized with
+	 * per-memcg dirty page counters.
+	 */
+	memcg = mem_cgroup_begin_page_stat(page);
 	newly_dirty = !TestSetPageDirty(page);
 	spin_unlock(&mapping->private_lock);
 
 	if (newly_dirty)
-		__set_page_dirty(page, mapping, 1);
+		__set_page_dirty(page, mapping, memcg, 1);
+
+	mem_cgroup_end_page_stat(memcg);
+
+	if (newly_dirty)
+		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+
 	return newly_dirty;
 }
 EXPORT_SYMBOL(__set_page_dirty_buffers);
@@ -1158,11 +1175,18 @@ void mark_buffer_dirty(struct buffer_head *bh)
 
 	if (!test_set_buffer_dirty(bh)) {
 		struct page *page = bh->b_page;
+		struct address_space *mapping = NULL;
+		struct mem_cgroup *memcg;
+
+		memcg = mem_cgroup_begin_page_stat(page);
 		if (!TestSetPageDirty(page)) {
-			struct address_space *mapping = page_mapping(page);
+			mapping = page_mapping(page);
 			if (mapping)
-				__set_page_dirty(page, mapping, 0);
+				__set_page_dirty(page, mapping, memcg, 0);
 		}
+		mem_cgroup_end_page_stat(memcg);
+		if (mapping)
+			__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 	}
 }
 EXPORT_SYMBOL(mark_buffer_dirty);
@@ -1684,8 +1708,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
 	struct buffer_head *bh, *head;
 	unsigned int blocksize, bbits;
 	int nr_underway = 0;
-	int write_op = (wbc->sync_mode == WB_SYNC_ALL ?
-			WRITE_SYNC : WRITE);
+	int write_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
 
 	head = create_page_buffers(page, inode,
 					(1 << BH_Dirty)|(1 << BH_Uptodate));
@@ -1774,7 +1797,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
 	do {
 		struct buffer_head *next = bh->b_this_page;
 		if (buffer_async_write(bh)) {
-			submit_bh(write_op, bh);
+			submit_bh_wbc(write_op, bh, 0, wbc);
 			nr_underway++;
 		}
 		bh = next;
@@ -1828,7 +1851,7 @@ recover:
 		struct buffer_head *next = bh->b_this_page;
 		if (buffer_async_write(bh)) {
 			clear_buffer_dirty(bh);
-			submit_bh(write_op, bh);
+			submit_bh_wbc(write_op, bh, 0, wbc);
 			nr_underway++;
 		}
 		bh = next;
@@ -2938,10 +2961,6 @@ static void end_bio_bh_io_sync(struct bio *bio, int err)
 {
 	struct buffer_head *bh = bio->bi_private;
 
-	if (err == -EOPNOTSUPP) {
-		set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
-	}
-
 	if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags)))
 		set_bit(BH_Quiet, &bh->b_state);
 
@@ -2997,10 +3016,10 @@ void guard_bio_eod(int rw, struct bio *bio)
 	}
 }
 
-int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
+static int submit_bh_wbc(int rw, struct buffer_head *bh,
+			 unsigned long bio_flags, struct writeback_control *wbc)
 {
 	struct bio *bio;
-	int ret = 0;
 
 	BUG_ON(!buffer_locked(bh));
 	BUG_ON(!buffer_mapped(bh));
@@ -3020,6 +3039,11 @@ int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
 	 */
 	bio = bio_alloc(GFP_NOIO, 1);
 
+	if (wbc) {
+		wbc_init_bio(wbc, bio);
+		wbc_account_io(wbc, bh->b_page, bh->b_size);
+	}
+
 	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
 	bio->bi_bdev = bh->b_bdev;
 	bio->bi_io_vec[0].bv_page = bh->b_page;
@@ -3041,20 +3065,19 @@ int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
 	if (buffer_prio(bh))
 		rw |= REQ_PRIO;
 
-	bio_get(bio);
 	submit_bio(rw, bio);
+	return 0;
+}
 
-	if (bio_flagged(bio, BIO_EOPNOTSUPP))
-		ret = -EOPNOTSUPP;
-
-	bio_put(bio);
-	return ret;
+int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
+{
+	return submit_bh_wbc(rw, bh, bio_flags, NULL);
 }
 EXPORT_SYMBOL_GPL(_submit_bh);
 
 int submit_bh(int rw, struct buffer_head *bh)
 {
-	return _submit_bh(rw, bh, 0);
+	return submit_bh_wbc(rw, bh, 0, NULL);
 }
 EXPORT_SYMBOL(submit_bh);
 
@@ -3243,8 +3266,8 @@ int try_to_free_buffers(struct page *page)
 	 * to synchronise against __set_page_dirty_buffers and prevent the
 	 * dirty bit from being lost.
 	 */
-	if (ret && TestClearPageDirty(page))
-		account_page_cleaned(page, mapping);
+	if (ret)
+		cancel_dirty_page(page);
 	spin_unlock(&mapping->private_lock);
 out:
 	if (buffers_to_free) {
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index d0e746e965118f..549219d44f1c12 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1543,7 +1543,7 @@ static struct file_system_type ext2_fs_type = {
 	.name		= "ext2",
 	.mount		= ext2_mount,
 	.kill_sb	= kill_block_super,
-	.fs_flags	= FS_REQUIRES_DEV,
+	.fs_flags	= FS_REQUIRES_DEV | FS_CGROUP_WRITEBACK,
 };
 MODULE_ALIAS_FS("ext2");
 
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index e003a1e81dc351..79acfbf6bc8aea 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -39,6 +39,7 @@
 #include <linux/slab.h>
 #include <asm/uaccess.h>
 #include <linux/fiemap.h>
+#include <linux/backing-dev.h>
 #include "ext4_jbd2.h"
 #include "ext4_extents.h"
 #include "xattr.h"
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 8d1e60214ef0a8..440987c8ba9ef2 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -26,6 +26,7 @@
 #include <linux/log2.h>
 #include <linux/module.h>
 #include <linux/slab.h>
+#include <linux/backing-dev.h>
 #include <trace/events/ext4.h>
 
 #ifdef CONFIG_EXT4_DEBUG
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 79636e21d3a2d2..5602450f03f649 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -359,7 +359,6 @@ void ext4_io_submit(struct ext4_io_submit *io)
 	if (bio) {
 		bio_get(io->io_bio);
 		submit_bio(io->io_op, io->io_bio);
-		BUG_ON(bio_flagged(io->io_bio, BIO_EOPNOTSUPP));
 		bio_put(io->io_bio);
 	}
 	io->io_bio = NULL;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index b0bd1c1061b3dd..3625a8cc0f5a98 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -24,6 +24,7 @@
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/parser.h>
 #include <linux/buffer_head.h>
 #include <linux/exportfs.h>
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index d9c52424bac215..7dd63b794bfb5a 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -53,7 +53,7 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
 							PAGE_CACHE_SHIFT;
 		res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2);
 	} else if (type == DIRTY_DENTS) {
-		if (sbi->sb->s_bdi->dirty_exceeded)
+		if (sbi->sb->s_bdi->wb.dirty_exceeded)
 			return false;
 		mem_size = get_pages(sbi, F2FS_DIRTY_DENTS);
 		res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
@@ -70,7 +70,7 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
 				sizeof(struct extent_node)) >> PAGE_CACHE_SHIFT;
 		res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
 	} else {
-		if (sbi->sb->s_bdi->dirty_exceeded)
+		if (sbi->sb->s_bdi->wb.dirty_exceeded)
 			return false;
 	}
 	return res;
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 84963577811881..79e7b879a75321 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -9,6 +9,7 @@
  * published by the Free Software Foundation.
  */
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 
 /* constant macro */
 #define NULL_SEGNO			((unsigned int)(~0))
@@ -714,7 +715,7 @@ static inline unsigned int max_hw_blocks(struct f2fs_sb_info *sbi)
  */
 static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type)
 {
-	if (sbi->sb->s_bdi->dirty_exceeded)
+	if (sbi->sb->s_bdi->wb.dirty_exceeded)
 		return 0;
 
 	if (type == DATA)
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 442d50a0e33e6a..a08f1039909a76 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -11,6 +11,7 @@
 #include <linux/compat.h>
 #include <linux/mount.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/fsnotify.h>
 #include <linux/security.h>
 #include "fat.h"
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index c06774658345ef..509411dd369895 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -18,6 +18,7 @@
 #include <linux/parser.h>
 #include <linux/uio.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <asm/unaligned.h>
 #include "fat.h"
 
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 32a8bbd7a9ad11..f60de54d204284 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -27,6 +27,7 @@
 #include <linux/backing-dev.h>
 #include <linux/tracepoint.h>
 #include <linux/device.h>
+#include <linux/memcontrol.h>
 #include "internal.h"
 
 /*
@@ -34,6 +35,10 @@
  */
 #define MIN_WRITEBACK_PAGES	(4096UL >> (PAGE_CACHE_SHIFT - 10))
 
+struct wb_completion {
+	atomic_t		cnt;
+};
+
 /*
  * Passed into wb_writeback(), essentially a subset of writeback_control
  */
@@ -47,13 +52,29 @@ struct wb_writeback_work {
 	unsigned int range_cyclic:1;
 	unsigned int for_background:1;
 	unsigned int for_sync:1;	/* sync(2) WB_SYNC_ALL writeback */
+	unsigned int auto_free:1;	/* free on completion */
+	unsigned int single_wait:1;
+	unsigned int single_done:1;
 	enum wb_reason reason;		/* why was writeback initiated? */
 
 	struct list_head list;		/* pending work list */
-	struct completion *done;	/* set if the caller waits */
+	struct wb_completion *done;	/* set if the caller waits */
 };
 
 /*
+ * If one wants to wait for one or more wb_writeback_works, each work's
+ * ->done should be set to a wb_completion defined using the following
+ * macro.  Once all work items are issued with wb_queue_work(), the caller
+ * can wait for the completion of all using wb_wait_for_completion().  Work
+ * items which are waited upon aren't freed automatically on completion.
+ */
+#define DEFINE_WB_COMPLETION_ONSTACK(cmpl)				\
+	struct wb_completion cmpl = {					\
+		.cnt		= ATOMIC_INIT(1),			\
+	}
+
+
+/*
  * If an inode is constantly having its pages dirtied, but then the
  * updates stop dirtytime_expire_interval seconds in the past, it's
  * possible for the worst case time between when an inode has its
@@ -65,35 +86,6 @@ struct wb_writeback_work {
  */
 unsigned int dirtytime_expire_interval = 12 * 60 * 60;
 
-/**
- * writeback_in_progress - determine whether there is writeback in progress
- * @bdi: the device's backing_dev_info structure.
- *
- * Determine whether there is writeback waiting to be handled against a
- * backing device.
- */
-int writeback_in_progress(struct backing_dev_info *bdi)
-{
-	return test_bit(BDI_writeback_running, &bdi->state);
-}
-EXPORT_SYMBOL(writeback_in_progress);
-
-struct backing_dev_info *inode_to_bdi(struct inode *inode)
-{
-	struct super_block *sb;
-
-	if (!inode)
-		return &noop_backing_dev_info;
-
-	sb = inode->i_sb;
-#ifdef CONFIG_BLOCK
-	if (sb_is_blkdev_sb(sb))
-		return blk_get_backing_dev_info(I_BDEV(inode));
-#endif
-	return sb->s_bdi;
-}
-EXPORT_SYMBOL_GPL(inode_to_bdi);
-
 static inline struct inode *wb_inode(struct list_head *head)
 {
 	return list_entry(head, struct inode, i_wb_list);
@@ -109,45 +101,820 @@ static inline struct inode *wb_inode(struct list_head *head)
 
 EXPORT_TRACEPOINT_SYMBOL_GPL(wbc_writepage);
 
-static void bdi_wakeup_thread(struct backing_dev_info *bdi)
+static bool wb_io_lists_populated(struct bdi_writeback *wb)
 {
-	spin_lock_bh(&bdi->wb_lock);
-	if (test_bit(BDI_registered, &bdi->state))
-		mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
-	spin_unlock_bh(&bdi->wb_lock);
+	if (wb_has_dirty_io(wb)) {
+		return false;
+	} else {
+		set_bit(WB_has_dirty_io, &wb->state);
+		WARN_ON_ONCE(!wb->avg_write_bandwidth);
+		atomic_long_add(wb->avg_write_bandwidth,
+				&wb->bdi->tot_write_bandwidth);
+		return true;
+	}
 }
 
-static void bdi_queue_work(struct backing_dev_info *bdi,
-			   struct wb_writeback_work *work)
+static void wb_io_lists_depopulated(struct bdi_writeback *wb)
 {
-	trace_writeback_queue(bdi, work);
+	if (wb_has_dirty_io(wb) && list_empty(&wb->b_dirty) &&
+	    list_empty(&wb->b_io) && list_empty(&wb->b_more_io)) {
+		clear_bit(WB_has_dirty_io, &wb->state);
+		WARN_ON_ONCE(atomic_long_sub_return(wb->avg_write_bandwidth,
+					&wb->bdi->tot_write_bandwidth) < 0);
+	}
+}
+
+/**
+ * inode_wb_list_move_locked - move an inode onto a bdi_writeback IO list
+ * @inode: inode to be moved
+ * @wb: target bdi_writeback
+ * @head: one of @wb->b_{dirty|io|more_io}
+ *
+ * Move @inode->i_wb_list to @list of @wb and set %WB_has_dirty_io.
+ * Returns %true if @inode is the first occupant of the !dirty_time IO
+ * lists; otherwise, %false.
+ */
+static bool inode_wb_list_move_locked(struct inode *inode,
+				      struct bdi_writeback *wb,
+				      struct list_head *head)
+{
+	assert_spin_locked(&wb->list_lock);
+
+	list_move(&inode->i_wb_list, head);
+
+	/* dirty_time doesn't count as dirty_io until expiration */
+	if (head != &wb->b_dirty_time)
+		return wb_io_lists_populated(wb);
+
+	wb_io_lists_depopulated(wb);
+	return false;
+}
+
+/**
+ * inode_wb_list_del_locked - remove an inode from its bdi_writeback IO list
+ * @inode: inode to be removed
+ * @wb: bdi_writeback @inode is being removed from
+ *
+ * Remove @inode which may be on one of @wb->b_{dirty|io|more_io} lists and
+ * clear %WB_has_dirty_io if all are empty afterwards.
+ */
+static void inode_wb_list_del_locked(struct inode *inode,
+				     struct bdi_writeback *wb)
+{
+	assert_spin_locked(&wb->list_lock);
+
+	list_del_init(&inode->i_wb_list);
+	wb_io_lists_depopulated(wb);
+}
+
+static void wb_wakeup(struct bdi_writeback *wb)
+{
+	spin_lock_bh(&wb->work_lock);
+	if (test_bit(WB_registered, &wb->state))
+		mod_delayed_work(bdi_wq, &wb->dwork, 0);
+	spin_unlock_bh(&wb->work_lock);
+}
 
-	spin_lock_bh(&bdi->wb_lock);
-	if (!test_bit(BDI_registered, &bdi->state)) {
-		if (work->done)
-			complete(work->done);
+static void wb_queue_work(struct bdi_writeback *wb,
+			  struct wb_writeback_work *work)
+{
+	trace_writeback_queue(wb->bdi, work);
+
+	spin_lock_bh(&wb->work_lock);
+	if (!test_bit(WB_registered, &wb->state)) {
+		if (work->single_wait)
+			work->single_done = 1;
 		goto out_unlock;
 	}
-	list_add_tail(&work->list, &bdi->work_list);
-	mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
+	if (work->done)
+		atomic_inc(&work->done->cnt);
+	list_add_tail(&work->list, &wb->work_list);
+	mod_delayed_work(bdi_wq, &wb->dwork, 0);
 out_unlock:
-	spin_unlock_bh(&bdi->wb_lock);
+	spin_unlock_bh(&wb->work_lock);
+}
+
+/**
+ * wb_wait_for_completion - wait for completion of bdi_writeback_works
+ * @bdi: bdi work items were issued to
+ * @done: target wb_completion
+ *
+ * Wait for one or more work items issued to @bdi with their ->done field
+ * set to @done, which should have been defined with
+ * DEFINE_WB_COMPLETION_ONSTACK().  This function returns after all such
+ * work items are completed.  Work items which are waited upon aren't freed
+ * automatically on completion.
+ */
+static void wb_wait_for_completion(struct backing_dev_info *bdi,
+				   struct wb_completion *done)
+{
+	atomic_dec(&done->cnt);		/* put down the initial count */
+	wait_event(bdi->wb_waitq, !atomic_read(&done->cnt));
+}
+
+#ifdef CONFIG_CGROUP_WRITEBACK
+
+/* parameters for foreign inode detection, see wb_detach_inode() */
+#define WB_FRN_TIME_SHIFT	13	/* 1s = 2^13, upto 8 secs w/ 16bit */
+#define WB_FRN_TIME_AVG_SHIFT	3	/* avg = avg * 7/8 + new * 1/8 */
+#define WB_FRN_TIME_CUT_DIV	2	/* ignore rounds < avg / 2 */
+#define WB_FRN_TIME_PERIOD	(2 * (1 << WB_FRN_TIME_SHIFT))	/* 2s */
+
+#define WB_FRN_HIST_SLOTS	16	/* inode->i_wb_frn_history is 16bit */
+#define WB_FRN_HIST_UNIT	(WB_FRN_TIME_PERIOD / WB_FRN_HIST_SLOTS)
+					/* each slot's duration is 2s / 16 */
+#define WB_FRN_HIST_THR_SLOTS	(WB_FRN_HIST_SLOTS / 2)
+					/* if foreign slots >= 8, switch */
+#define WB_FRN_HIST_MAX_SLOTS	(WB_FRN_HIST_THR_SLOTS / 2 + 1)
+					/* one round can affect upto 5 slots */
+
+void __inode_attach_wb(struct inode *inode, struct page *page)
+{
+	struct backing_dev_info *bdi = inode_to_bdi(inode);
+	struct bdi_writeback *wb = NULL;
+
+	if (inode_cgwb_enabled(inode)) {
+		struct cgroup_subsys_state *memcg_css;
+
+		if (page) {
+			memcg_css = mem_cgroup_css_from_page(page);
+			wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
+		} else {
+			/* must pin memcg_css, see wb_get_create() */
+			memcg_css = task_get_css(current, memory_cgrp_id);
+			wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
+			css_put(memcg_css);
+		}
+	}
+
+	if (!wb)
+		wb = &bdi->wb;
+
+	/*
+	 * There may be multiple instances of this function racing to
+	 * update the same inode.  Use cmpxchg() to tell the winner.
+	 */
+	if (unlikely(cmpxchg(&inode->i_wb, NULL, wb)))
+		wb_put(wb);
+}
+
+/**
+ * locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it
+ * @inode: inode of interest with i_lock held
+ *
+ * Returns @inode's wb with its list_lock held.  @inode->i_lock must be
+ * held on entry and is released on return.  The returned wb is guaranteed
+ * to stay @inode's associated wb until its list_lock is released.
+ */
+static struct bdi_writeback *
+locked_inode_to_wb_and_lock_list(struct inode *inode)
+	__releases(&inode->i_lock)
+	__acquires(&wb->list_lock)
+{
+	while (true) {
+		struct bdi_writeback *wb = inode_to_wb(inode);
+
+		/*
+		 * inode_to_wb() association is protected by both
+		 * @inode->i_lock and @wb->list_lock but list_lock nests
+		 * outside i_lock.  Drop i_lock and verify that the
+		 * association hasn't changed after acquiring list_lock.
+		 */
+		wb_get(wb);
+		spin_unlock(&inode->i_lock);
+		spin_lock(&wb->list_lock);
+		wb_put(wb);		/* not gonna deref it anymore */
+
+		/* i_wb may have changed inbetween, can't use inode_to_wb() */
+		if (likely(wb == inode->i_wb))
+			return wb;	/* @inode already has ref */
+
+		spin_unlock(&wb->list_lock);
+		cpu_relax();
+		spin_lock(&inode->i_lock);
+	}
+}
+
+/**
+ * inode_to_wb_and_lock_list - determine an inode's wb and lock it
+ * @inode: inode of interest
+ *
+ * Same as locked_inode_to_wb_and_lock_list() but @inode->i_lock isn't held
+ * on entry.
+ */
+static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
+	__acquires(&wb->list_lock)
+{
+	spin_lock(&inode->i_lock);
+	return locked_inode_to_wb_and_lock_list(inode);
+}
+
+struct inode_switch_wbs_context {
+	struct inode		*inode;
+	struct bdi_writeback	*new_wb;
+
+	struct rcu_head		rcu_head;
+	struct work_struct	work;
+};
+
+static void inode_switch_wbs_work_fn(struct work_struct *work)
+{
+	struct inode_switch_wbs_context *isw =
+		container_of(work, struct inode_switch_wbs_context, work);
+	struct inode *inode = isw->inode;
+	struct address_space *mapping = inode->i_mapping;
+	struct bdi_writeback *old_wb = inode->i_wb;
+	struct bdi_writeback *new_wb = isw->new_wb;
+	struct radix_tree_iter iter;
+	bool switched = false;
+	void **slot;
+
+	/*
+	 * By the time control reaches here, RCU grace period has passed
+	 * since I_WB_SWITCH assertion and all wb stat update transactions
+	 * between unlocked_inode_to_wb_begin/end() are guaranteed to be
+	 * synchronizing against mapping->tree_lock.
+	 *
+	 * Grabbing old_wb->list_lock, inode->i_lock and mapping->tree_lock
+	 * gives us exclusion against all wb related operations on @inode
+	 * including IO list manipulations and stat updates.
+	 */
+	if (old_wb < new_wb) {
+		spin_lock(&old_wb->list_lock);
+		spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING);
+	} else {
+		spin_lock(&new_wb->list_lock);
+		spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING);
+	}
+	spin_lock(&inode->i_lock);
+	spin_lock_irq(&mapping->tree_lock);
+
+	/*
+	 * Once I_FREEING is visible under i_lock, the eviction path owns
+	 * the inode and we shouldn't modify ->i_wb_list.
+	 */
+	if (unlikely(inode->i_state & I_FREEING))
+		goto skip_switch;
+
+	/*
+	 * Count and transfer stats.  Note that PAGECACHE_TAG_DIRTY points
+	 * to possibly dirty pages while PAGECACHE_TAG_WRITEBACK points to
+	 * pages actually under underwriteback.
+	 */
+	radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0,
+				   PAGECACHE_TAG_DIRTY) {
+		struct page *page = radix_tree_deref_slot_protected(slot,
+							&mapping->tree_lock);
+		if (likely(page) && PageDirty(page)) {
+			__dec_wb_stat(old_wb, WB_RECLAIMABLE);
+			__inc_wb_stat(new_wb, WB_RECLAIMABLE);
+		}
+	}
+
+	radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0,
+				   PAGECACHE_TAG_WRITEBACK) {
+		struct page *page = radix_tree_deref_slot_protected(slot,
+							&mapping->tree_lock);
+		if (likely(page)) {
+			WARN_ON_ONCE(!PageWriteback(page));
+			__dec_wb_stat(old_wb, WB_WRITEBACK);
+			__inc_wb_stat(new_wb, WB_WRITEBACK);
+		}
+	}
+
+	wb_get(new_wb);
+
+	/*
+	 * Transfer to @new_wb's IO list if necessary.  The specific list
+	 * @inode was on is ignored and the inode is put on ->b_dirty which
+	 * is always correct including from ->b_dirty_time.  The transfer
+	 * preserves @inode->dirtied_when ordering.
+	 */
+	if (!list_empty(&inode->i_wb_list)) {
+		struct inode *pos;
+
+		inode_wb_list_del_locked(inode, old_wb);
+		inode->i_wb = new_wb;
+		list_for_each_entry(pos, &new_wb->b_dirty, i_wb_list)
+			if (time_after_eq(inode->dirtied_when,
+					  pos->dirtied_when))
+				break;
+		inode_wb_list_move_locked(inode, new_wb, pos->i_wb_list.prev);
+	} else {
+		inode->i_wb = new_wb;
+	}
+
+	/* ->i_wb_frn updates may race wbc_detach_inode() but doesn't matter */
+	inode->i_wb_frn_winner = 0;
+	inode->i_wb_frn_avg_time = 0;
+	inode->i_wb_frn_history = 0;
+	switched = true;
+skip_switch:
+	/*
+	 * Paired with load_acquire in unlocked_inode_to_wb_begin() and
+	 * ensures that the new wb is visible if they see !I_WB_SWITCH.
+	 */
+	smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH);
+
+	spin_unlock_irq(&mapping->tree_lock);
+	spin_unlock(&inode->i_lock);
+	spin_unlock(&new_wb->list_lock);
+	spin_unlock(&old_wb->list_lock);
+
+	if (switched) {
+		wb_wakeup(new_wb);
+		wb_put(old_wb);
+	}
+	wb_put(new_wb);
+
+	iput(inode);
+	kfree(isw);
+}
+
+static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head)
+{
+	struct inode_switch_wbs_context *isw = container_of(rcu_head,
+				struct inode_switch_wbs_context, rcu_head);
+
+	/* needs to grab bh-unsafe locks, bounce to work item */
+	INIT_WORK(&isw->work, inode_switch_wbs_work_fn);
+	schedule_work(&isw->work);
 }
 
-static void
-__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
-		      bool range_cyclic, enum wb_reason reason)
+/**
+ * inode_switch_wbs - change the wb association of an inode
+ * @inode: target inode
+ * @new_wb_id: ID of the new wb
+ *
+ * Switch @inode's wb association to the wb identified by @new_wb_id.  The
+ * switching is performed asynchronously and may fail silently.
+ */
+static void inode_switch_wbs(struct inode *inode, int new_wb_id)
+{
+	struct backing_dev_info *bdi = inode_to_bdi(inode);
+	struct cgroup_subsys_state *memcg_css;
+	struct inode_switch_wbs_context *isw;
+
+	/* noop if seems to be already in progress */
+	if (inode->i_state & I_WB_SWITCH)
+		return;
+
+	isw = kzalloc(sizeof(*isw), GFP_ATOMIC);
+	if (!isw)
+		return;
+
+	/* find and pin the new wb */
+	rcu_read_lock();
+	memcg_css = css_from_id(new_wb_id, &memory_cgrp_subsys);
+	if (memcg_css)
+		isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
+	rcu_read_unlock();
+	if (!isw->new_wb)
+		goto out_free;
+
+	/* while holding I_WB_SWITCH, no one else can update the association */
+	spin_lock(&inode->i_lock);
+	if (inode->i_state & (I_WB_SWITCH | I_FREEING) ||
+	    inode_to_wb(inode) == isw->new_wb) {
+		spin_unlock(&inode->i_lock);
+		goto out_free;
+	}
+	inode->i_state |= I_WB_SWITCH;
+	spin_unlock(&inode->i_lock);
+
+	ihold(inode);
+	isw->inode = inode;
+
+	/*
+	 * In addition to synchronizing among switchers, I_WB_SWITCH tells
+	 * the RCU protected stat update paths to grab the mapping's
+	 * tree_lock so that stat transfer can synchronize against them.
+	 * Let's continue after I_WB_SWITCH is guaranteed to be visible.
+	 */
+	call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn);
+	return;
+
+out_free:
+	if (isw->new_wb)
+		wb_put(isw->new_wb);
+	kfree(isw);
+}
+
+/**
+ * wbc_attach_and_unlock_inode - associate wbc with target inode and unlock it
+ * @wbc: writeback_control of interest
+ * @inode: target inode
+ *
+ * @inode is locked and about to be written back under the control of @wbc.
+ * Record @inode's writeback context into @wbc and unlock the i_lock.  On
+ * writeback completion, wbc_detach_inode() should be called.  This is used
+ * to track the cgroup writeback context.
+ */
+void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
+				 struct inode *inode)
+{
+	wbc->wb = inode_to_wb(inode);
+	wbc->inode = inode;
+
+	wbc->wb_id = wbc->wb->memcg_css->id;
+	wbc->wb_lcand_id = inode->i_wb_frn_winner;
+	wbc->wb_tcand_id = 0;
+	wbc->wb_bytes = 0;
+	wbc->wb_lcand_bytes = 0;
+	wbc->wb_tcand_bytes = 0;
+
+	wb_get(wbc->wb);
+	spin_unlock(&inode->i_lock);
+
+	/*
+	 * A dying wb indicates that the memcg-blkcg mapping has changed
+	 * and a new wb is already serving the memcg.  Switch immediately.
+	 */
+	if (unlikely(wb_dying(wbc->wb)))
+		inode_switch_wbs(inode, wbc->wb_id);
+}
+
+/**
+ * wbc_detach_inode - disassociate wbc from inode and perform foreign detection
+ * @wbc: writeback_control of the just finished writeback
+ *
+ * To be called after a writeback attempt of an inode finishes and undoes
+ * wbc_attach_and_unlock_inode().  Can be called under any context.
+ *
+ * As concurrent write sharing of an inode is expected to be very rare and
+ * memcg only tracks page ownership on first-use basis severely confining
+ * the usefulness of such sharing, cgroup writeback tracks ownership
+ * per-inode.  While the support for concurrent write sharing of an inode
+ * is deemed unnecessary, an inode being written to by different cgroups at
+ * different points in time is a lot more common, and, more importantly,
+ * charging only by first-use can too readily lead to grossly incorrect
+ * behaviors (single foreign page can lead to gigabytes of writeback to be
+ * incorrectly attributed).
+ *
+ * To resolve this issue, cgroup writeback detects the majority dirtier of
+ * an inode and transfers the ownership to it.  To avoid unnnecessary
+ * oscillation, the detection mechanism keeps track of history and gives
+ * out the switch verdict only if the foreign usage pattern is stable over
+ * a certain amount of time and/or writeback attempts.
+ *
+ * On each writeback attempt, @wbc tries to detect the majority writer
+ * using Boyer-Moore majority vote algorithm.  In addition to the byte
+ * count from the majority voting, it also counts the bytes written for the
+ * current wb and the last round's winner wb (max of last round's current
+ * wb, the winner from two rounds ago, and the last round's majority
+ * candidate).  Keeping track of the historical winner helps the algorithm
+ * to semi-reliably detect the most active writer even when it's not the
+ * absolute majority.
+ *
+ * Once the winner of the round is determined, whether the winner is
+ * foreign or not and how much IO time the round consumed is recorded in
+ * inode->i_wb_frn_history.  If the amount of recorded foreign IO time is
+ * over a certain threshold, the switch verdict is given.
+ */
+void wbc_detach_inode(struct writeback_control *wbc)
+{
+	struct bdi_writeback *wb = wbc->wb;
+	struct inode *inode = wbc->inode;
+	u16 history = inode->i_wb_frn_history;
+	unsigned long avg_time = inode->i_wb_frn_avg_time;
+	unsigned long max_bytes, max_time;
+	int max_id;
+
+	/* pick the winner of this round */
+	if (wbc->wb_bytes >= wbc->wb_lcand_bytes &&
+	    wbc->wb_bytes >= wbc->wb_tcand_bytes) {
+		max_id = wbc->wb_id;
+		max_bytes = wbc->wb_bytes;
+	} else if (wbc->wb_lcand_bytes >= wbc->wb_tcand_bytes) {
+		max_id = wbc->wb_lcand_id;
+		max_bytes = wbc->wb_lcand_bytes;
+	} else {
+		max_id = wbc->wb_tcand_id;
+		max_bytes = wbc->wb_tcand_bytes;
+	}
+
+	/*
+	 * Calculate the amount of IO time the winner consumed and fold it
+	 * into the running average kept per inode.  If the consumed IO
+	 * time is lower than avag / WB_FRN_TIME_CUT_DIV, ignore it for
+	 * deciding whether to switch or not.  This is to prevent one-off
+	 * small dirtiers from skewing the verdict.
+	 */
+	max_time = DIV_ROUND_UP((max_bytes >> PAGE_SHIFT) << WB_FRN_TIME_SHIFT,
+				wb->avg_write_bandwidth);
+	if (avg_time)
+		avg_time += (max_time >> WB_FRN_TIME_AVG_SHIFT) -
+			    (avg_time >> WB_FRN_TIME_AVG_SHIFT);
+	else
+		avg_time = max_time;	/* immediate catch up on first run */
+
+	if (max_time >= avg_time / WB_FRN_TIME_CUT_DIV) {
+		int slots;
+
+		/*
+		 * The switch verdict is reached if foreign wb's consume
+		 * more than a certain proportion of IO time in a
+		 * WB_FRN_TIME_PERIOD.  This is loosely tracked by 16 slot
+		 * history mask where each bit represents one sixteenth of
+		 * the period.  Determine the number of slots to shift into
+		 * history from @max_time.
+		 */
+		slots = min(DIV_ROUND_UP(max_time, WB_FRN_HIST_UNIT),
+			    (unsigned long)WB_FRN_HIST_MAX_SLOTS);
+		history <<= slots;
+		if (wbc->wb_id != max_id)
+			history |= (1U << slots) - 1;
+
+		/*
+		 * Switch if the current wb isn't the consistent winner.
+		 * If there are multiple closely competing dirtiers, the
+		 * inode may switch across them repeatedly over time, which
+		 * is okay.  The main goal is avoiding keeping an inode on
+		 * the wrong wb for an extended period of time.
+		 */
+		if (hweight32(history) > WB_FRN_HIST_THR_SLOTS)
+			inode_switch_wbs(inode, max_id);
+	}
+
+	/*
+	 * Multiple instances of this function may race to update the
+	 * following fields but we don't mind occassional inaccuracies.
+	 */
+	inode->i_wb_frn_winner = max_id;
+	inode->i_wb_frn_avg_time = min(avg_time, (unsigned long)U16_MAX);
+	inode->i_wb_frn_history = history;
+
+	wb_put(wbc->wb);
+	wbc->wb = NULL;
+}
+
+/**
+ * wbc_account_io - account IO issued during writeback
+ * @wbc: writeback_control of the writeback in progress
+ * @page: page being written out
+ * @bytes: number of bytes being written out
+ *
+ * @bytes from @page are about to written out during the writeback
+ * controlled by @wbc.  Keep the book for foreign inode detection.  See
+ * wbc_detach_inode().
+ */
+void wbc_account_io(struct writeback_control *wbc, struct page *page,
+		    size_t bytes)
+{
+	int id;
+
+	/*
+	 * pageout() path doesn't attach @wbc to the inode being written
+	 * out.  This is intentional as we don't want the function to block
+	 * behind a slow cgroup.  Ultimately, we want pageout() to kick off
+	 * regular writeback instead of writing things out itself.
+	 */
+	if (!wbc->wb)
+		return;
+
+	rcu_read_lock();
+	id = mem_cgroup_css_from_page(page)->id;
+	rcu_read_unlock();
+
+	if (id == wbc->wb_id) {
+		wbc->wb_bytes += bytes;
+		return;
+	}
+
+	if (id == wbc->wb_lcand_id)
+		wbc->wb_lcand_bytes += bytes;
+
+	/* Boyer-Moore majority vote algorithm */
+	if (!wbc->wb_tcand_bytes)
+		wbc->wb_tcand_id = id;
+	if (id == wbc->wb_tcand_id)
+		wbc->wb_tcand_bytes += bytes;
+	else
+		wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes);
+}
+
+/**
+ * inode_congested - test whether an inode is congested
+ * @inode: inode to test for congestion
+ * @cong_bits: mask of WB_[a]sync_congested bits to test
+ *
+ * Tests whether @inode is congested.  @cong_bits is the mask of congestion
+ * bits to test and the return value is the mask of set bits.
+ *
+ * If cgroup writeback is enabled for @inode, the congestion state is
+ * determined by whether the cgwb (cgroup bdi_writeback) for the blkcg
+ * associated with @inode is congested; otherwise, the root wb's congestion
+ * state is used.
+ */
+int inode_congested(struct inode *inode, int cong_bits)
+{
+	/*
+	 * Once set, ->i_wb never becomes NULL while the inode is alive.
+	 * Start transaction iff ->i_wb is visible.
+	 */
+	if (inode && inode_to_wb_is_valid(inode)) {
+		struct bdi_writeback *wb;
+		bool locked, congested;
+
+		wb = unlocked_inode_to_wb_begin(inode, &locked);
+		congested = wb_congested(wb, cong_bits);
+		unlocked_inode_to_wb_end(inode, locked);
+		return congested;
+	}
+
+	return wb_congested(&inode_to_bdi(inode)->wb, cong_bits);
+}
+EXPORT_SYMBOL_GPL(inode_congested);
+
+/**
+ * wb_wait_for_single_work - wait for completion of a single bdi_writeback_work
+ * @bdi: bdi the work item was issued to
+ * @work: work item to wait for
+ *
+ * Wait for the completion of @work which was issued to one of @bdi's
+ * bdi_writeback's.  The caller must have set @work->single_wait before
+ * issuing it.  This wait operates independently fo
+ * wb_wait_for_completion() and also disables automatic freeing of @work.
+ */
+static void wb_wait_for_single_work(struct backing_dev_info *bdi,
+				    struct wb_writeback_work *work)
+{
+	if (WARN_ON_ONCE(!work->single_wait))
+		return;
+
+	wait_event(bdi->wb_waitq, work->single_done);
+
+	/*
+	 * Paired with smp_wmb() in wb_do_writeback() and ensures that all
+	 * modifications to @work prior to assertion of ->single_done is
+	 * visible to the caller once this function returns.
+	 */
+	smp_rmb();
+}
+
+/**
+ * wb_split_bdi_pages - split nr_pages to write according to bandwidth
+ * @wb: target bdi_writeback to split @nr_pages to
+ * @nr_pages: number of pages to write for the whole bdi
+ *
+ * Split @wb's portion of @nr_pages according to @wb's write bandwidth in
+ * relation to the total write bandwidth of all wb's w/ dirty inodes on
+ * @wb->bdi.
+ */
+static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
+{
+	unsigned long this_bw = wb->avg_write_bandwidth;
+	unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth);
+
+	if (nr_pages == LONG_MAX)
+		return LONG_MAX;
+
+	/*
+	 * This may be called on clean wb's and proportional distribution
+	 * may not make sense, just use the original @nr_pages in those
+	 * cases.  In general, we wanna err on the side of writing more.
+	 */
+	if (!tot_bw || this_bw >= tot_bw)
+		return nr_pages;
+	else
+		return DIV_ROUND_UP_ULL((u64)nr_pages * this_bw, tot_bw);
+}
+
+/**
+ * wb_clone_and_queue_work - clone a wb_writeback_work and issue it to a wb
+ * @wb: target bdi_writeback
+ * @base_work: source wb_writeback_work
+ *
+ * Try to make a clone of @base_work and issue it to @wb.  If cloning
+ * succeeds, %true is returned; otherwise, @base_work is issued directly
+ * and %false is returned.  In the latter case, the caller is required to
+ * wait for @base_work's completion using wb_wait_for_single_work().
+ *
+ * A clone is auto-freed on completion.  @base_work never is.
+ */
+static bool wb_clone_and_queue_work(struct bdi_writeback *wb,
+				    struct wb_writeback_work *base_work)
 {
 	struct wb_writeback_work *work;
 
+	work = kmalloc(sizeof(*work), GFP_ATOMIC);
+	if (work) {
+		*work = *base_work;
+		work->auto_free = 1;
+		work->single_wait = 0;
+	} else {
+		work = base_work;
+		work->auto_free = 0;
+		work->single_wait = 1;
+	}
+	work->single_done = 0;
+	wb_queue_work(wb, work);
+	return work != base_work;
+}
+
+/**
+ * bdi_split_work_to_wbs - split a wb_writeback_work to all wb's of a bdi
+ * @bdi: target backing_dev_info
+ * @base_work: wb_writeback_work to issue
+ * @skip_if_busy: skip wb's which already have writeback in progress
+ *
+ * Split and issue @base_work to all wb's (bdi_writeback's) of @bdi which
+ * have dirty inodes.  If @base_work->nr_page isn't %LONG_MAX, it's
+ * distributed to the busy wbs according to each wb's proportion in the
+ * total active write bandwidth of @bdi.
+ */
+static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
+				  struct wb_writeback_work *base_work,
+				  bool skip_if_busy)
+{
+	long nr_pages = base_work->nr_pages;
+	int next_blkcg_id = 0;
+	struct bdi_writeback *wb;
+	struct wb_iter iter;
+
+	might_sleep();
+
+	if (!bdi_has_dirty_io(bdi))
+		return;
+restart:
+	rcu_read_lock();
+	bdi_for_each_wb(wb, bdi, &iter, next_blkcg_id) {
+		if (!wb_has_dirty_io(wb) ||
+		    (skip_if_busy && writeback_in_progress(wb)))
+			continue;
+
+		base_work->nr_pages = wb_split_bdi_pages(wb, nr_pages);
+		if (!wb_clone_and_queue_work(wb, base_work)) {
+			next_blkcg_id = wb->blkcg_css->id + 1;
+			rcu_read_unlock();
+			wb_wait_for_single_work(bdi, base_work);
+			goto restart;
+		}
+	}
+	rcu_read_unlock();
+}
+
+#else	/* CONFIG_CGROUP_WRITEBACK */
+
+static struct bdi_writeback *
+locked_inode_to_wb_and_lock_list(struct inode *inode)
+	__releases(&inode->i_lock)
+	__acquires(&wb->list_lock)
+{
+	struct bdi_writeback *wb = inode_to_wb(inode);
+
+	spin_unlock(&inode->i_lock);
+	spin_lock(&wb->list_lock);
+	return wb;
+}
+
+static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
+	__acquires(&wb->list_lock)
+{
+	struct bdi_writeback *wb = inode_to_wb(inode);
+
+	spin_lock(&wb->list_lock);
+	return wb;
+}
+
+static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
+{
+	return nr_pages;
+}
+
+static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
+				  struct wb_writeback_work *base_work,
+				  bool skip_if_busy)
+{
+	might_sleep();
+
+	if (bdi_has_dirty_io(bdi) &&
+	    (!skip_if_busy || !writeback_in_progress(&bdi->wb))) {
+		base_work->auto_free = 0;
+		base_work->single_wait = 0;
+		base_work->single_done = 0;
+		wb_queue_work(&bdi->wb, base_work);
+	}
+}
+
+#endif	/* CONFIG_CGROUP_WRITEBACK */
+
+void wb_start_writeback(struct bdi_writeback *wb, long nr_pages,
+			bool range_cyclic, enum wb_reason reason)
+{
+	struct wb_writeback_work *work;
+
+	if (!wb_has_dirty_io(wb))
+		return;
+
 	/*
 	 * This is WB_SYNC_NONE writeback, so if allocation fails just
 	 * wakeup the thread for old dirty data writeback
 	 */
 	work = kzalloc(sizeof(*work), GFP_ATOMIC);
 	if (!work) {
-		trace_writeback_nowork(bdi);
-		bdi_wakeup_thread(bdi);
+		trace_writeback_nowork(wb->bdi);
+		wb_wakeup(wb);
 		return;
 	}
 
@@ -155,46 +922,29 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
 	work->nr_pages	= nr_pages;
 	work->range_cyclic = range_cyclic;
 	work->reason	= reason;
+	work->auto_free	= 1;
 
-	bdi_queue_work(bdi, work);
-}
-
-/**
- * bdi_start_writeback - start writeback
- * @bdi: the backing device to write from
- * @nr_pages: the number of pages to write
- * @reason: reason why some writeback work was initiated
- *
- * Description:
- *   This does WB_SYNC_NONE opportunistic writeback. The IO is only
- *   started when this function returns, we make no guarantees on
- *   completion. Caller need not hold sb s_umount semaphore.
- *
- */
-void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
-			enum wb_reason reason)
-{
-	__bdi_start_writeback(bdi, nr_pages, true, reason);
+	wb_queue_work(wb, work);
 }
 
 /**
- * bdi_start_background_writeback - start background writeback
- * @bdi: the backing device to write from
+ * wb_start_background_writeback - start background writeback
+ * @wb: bdi_writback to write from
  *
  * Description:
  *   This makes sure WB_SYNC_NONE background writeback happens. When
- *   this function returns, it is only guaranteed that for given BDI
+ *   this function returns, it is only guaranteed that for given wb
  *   some IO is happening if we are over background dirty threshold.
  *   Caller need not hold sb s_umount semaphore.
  */
-void bdi_start_background_writeback(struct backing_dev_info *bdi)
+void wb_start_background_writeback(struct bdi_writeback *wb)
 {
 	/*
 	 * We just wake up the flusher thread. It will perform background
 	 * writeback as soon as there is no other work to do.
 	 */
-	trace_writeback_wake_background(bdi);
-	bdi_wakeup_thread(bdi);
+	trace_writeback_wake_background(wb->bdi);
+	wb_wakeup(wb);
 }
 
 /*
@@ -202,11 +952,11 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi)
  */
 void inode_wb_list_del(struct inode *inode)
 {
-	struct backing_dev_info *bdi = inode_to_bdi(inode);
+	struct bdi_writeback *wb;
 
-	spin_lock(&bdi->wb.list_lock);
-	list_del_init(&inode->i_wb_list);
-	spin_unlock(&bdi->wb.list_lock);
+	wb = inode_to_wb_and_lock_list(inode);
+	inode_wb_list_del_locked(inode, wb);
+	spin_unlock(&wb->list_lock);
 }
 
 /*
@@ -220,7 +970,6 @@ void inode_wb_list_del(struct inode *inode)
  */
 static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
 {
-	assert_spin_locked(&wb->list_lock);
 	if (!list_empty(&wb->b_dirty)) {
 		struct inode *tail;
 
@@ -228,7 +977,7 @@ static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
 		if (time_before(inode->dirtied_when, tail->dirtied_when))
 			inode->dirtied_when = jiffies;
 	}
-	list_move(&inode->i_wb_list, &wb->b_dirty);
+	inode_wb_list_move_locked(inode, wb, &wb->b_dirty);
 }
 
 /*
@@ -236,8 +985,7 @@ static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
  */
 static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
 {
-	assert_spin_locked(&wb->list_lock);
-	list_move(&inode->i_wb_list, &wb->b_more_io);
+	inode_wb_list_move_locked(inode, wb, &wb->b_more_io);
 }
 
 static void inode_sync_complete(struct inode *inode)
@@ -346,6 +1094,8 @@ static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
 	moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, work);
 	moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io,
 				     EXPIRE_DIRTY_ATIME, work);
+	if (moved)
+		wb_io_lists_populated(wb);
 	trace_writeback_queue_io(wb, work, moved);
 }
 
@@ -471,10 +1221,10 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
 		redirty_tail(inode, wb);
 	} else if (inode->i_state & I_DIRTY_TIME) {
 		inode->dirtied_when = jiffies;
-		list_move(&inode->i_wb_list, &wb->b_dirty_time);
+		inode_wb_list_move_locked(inode, wb, &wb->b_dirty_time);
 	} else {
 		/* The inode is clean. Remove from writeback lists. */
-		list_del_init(&inode->i_wb_list);
+		inode_wb_list_del_locked(inode, wb);
 	}
 }
 
@@ -605,10 +1355,11 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
 	     !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)))
 		goto out;
 	inode->i_state |= I_SYNC;
-	spin_unlock(&inode->i_lock);
+	wbc_attach_and_unlock_inode(wbc, inode);
 
 	ret = __writeback_single_inode(inode, wbc);
 
+	wbc_detach_inode(wbc);
 	spin_lock(&wb->list_lock);
 	spin_lock(&inode->i_lock);
 	/*
@@ -616,7 +1367,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
 	 * touch it. See comment above for explanation.
 	 */
 	if (!(inode->i_state & I_DIRTY_ALL))
-		list_del_init(&inode->i_wb_list);
+		inode_wb_list_del_locked(inode, wb);
 	spin_unlock(&wb->list_lock);
 	inode_sync_complete(inode);
 out:
@@ -624,7 +1375,7 @@ out:
 	return ret;
 }
 
-static long writeback_chunk_size(struct backing_dev_info *bdi,
+static long writeback_chunk_size(struct bdi_writeback *wb,
 				 struct wb_writeback_work *work)
 {
 	long pages;
@@ -645,8 +1396,8 @@ static long writeback_chunk_size(struct backing_dev_info *bdi,
 	if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages)
 		pages = LONG_MAX;
 	else {
-		pages = min(bdi->avg_write_bandwidth / 2,
-			    global_dirty_limit / DIRTY_SCOPE);
+		pages = min(wb->avg_write_bandwidth / 2,
+			    global_wb_domain.dirty_limit / DIRTY_SCOPE);
 		pages = min(pages, work->nr_pages);
 		pages = round_down(pages + MIN_WRITEBACK_PAGES,
 				   MIN_WRITEBACK_PAGES);
@@ -741,9 +1492,9 @@ static long writeback_sb_inodes(struct super_block *sb,
 			continue;
 		}
 		inode->i_state |= I_SYNC;
-		spin_unlock(&inode->i_lock);
+		wbc_attach_and_unlock_inode(&wbc, inode);
 
-		write_chunk = writeback_chunk_size(wb->bdi, work);
+		write_chunk = writeback_chunk_size(wb, work);
 		wbc.nr_to_write = write_chunk;
 		wbc.pages_skipped = 0;
 
@@ -753,6 +1504,7 @@ static long writeback_sb_inodes(struct super_block *sb,
 		 */
 		__writeback_single_inode(inode, &wbc);
 
+		wbc_detach_inode(&wbc);
 		work->nr_pages -= write_chunk - wbc.nr_to_write;
 		wrote += write_chunk - wbc.nr_to_write;
 		spin_lock(&wb->list_lock);
@@ -830,33 +1582,6 @@ static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
 	return nr_pages - work.nr_pages;
 }
 
-static bool over_bground_thresh(struct backing_dev_info *bdi)
-{
-	unsigned long background_thresh, dirty_thresh;
-
-	global_dirty_limits(&background_thresh, &dirty_thresh);
-
-	if (global_page_state(NR_FILE_DIRTY) +
-	    global_page_state(NR_UNSTABLE_NFS) > background_thresh)
-		return true;
-
-	if (bdi_stat(bdi, BDI_RECLAIMABLE) >
-				bdi_dirty_limit(bdi, background_thresh))
-		return true;
-
-	return false;
-}
-
-/*
- * Called under wb->list_lock. If there are multiple wb per bdi,
- * only the flusher working on the first wb should do it.
- */
-static void wb_update_bandwidth(struct bdi_writeback *wb,
-				unsigned long start_time)
-{
-	__bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, 0, start_time);
-}
-
 /*
  * Explicit flushing or periodic writeback of "old" data.
  *
@@ -899,14 +1624,14 @@ static long wb_writeback(struct bdi_writeback *wb,
 		 * after the other works are all done.
 		 */
 		if ((work->for_background || work->for_kupdate) &&
-		    !list_empty(&wb->bdi->work_list))
+		    !list_empty(&wb->work_list))
 			break;
 
 		/*
 		 * For background writeout, stop when we are below the
 		 * background dirty threshold
 		 */
-		if (work->for_background && !over_bground_thresh(wb->bdi))
+		if (work->for_background && !wb_over_bg_thresh(wb))
 			break;
 
 		/*
@@ -970,18 +1695,17 @@ static long wb_writeback(struct bdi_writeback *wb,
 /*
  * Return the next wb_writeback_work struct that hasn't been processed yet.
  */
-static struct wb_writeback_work *
-get_next_work_item(struct backing_dev_info *bdi)
+static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb)
 {
 	struct wb_writeback_work *work = NULL;
 
-	spin_lock_bh(&bdi->wb_lock);
-	if (!list_empty(&bdi->work_list)) {
-		work = list_entry(bdi->work_list.next,
+	spin_lock_bh(&wb->work_lock);
+	if (!list_empty(&wb->work_list)) {
+		work = list_entry(wb->work_list.next,
 				  struct wb_writeback_work, list);
 		list_del_init(&work->list);
 	}
-	spin_unlock_bh(&bdi->wb_lock);
+	spin_unlock_bh(&wb->work_lock);
 	return work;
 }
 
@@ -998,7 +1722,7 @@ static unsigned long get_nr_dirty_pages(void)
 
 static long wb_check_background_flush(struct bdi_writeback *wb)
 {
-	if (over_bground_thresh(wb->bdi)) {
+	if (wb_over_bg_thresh(wb)) {
 
 		struct wb_writeback_work work = {
 			.nr_pages	= LONG_MAX,
@@ -1053,25 +1777,33 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
  */
 static long wb_do_writeback(struct bdi_writeback *wb)
 {
-	struct backing_dev_info *bdi = wb->bdi;
 	struct wb_writeback_work *work;
 	long wrote = 0;
 
-	set_bit(BDI_writeback_running, &wb->bdi->state);
-	while ((work = get_next_work_item(bdi)) != NULL) {
+	set_bit(WB_writeback_running, &wb->state);
+	while ((work = get_next_work_item(wb)) != NULL) {
+		struct wb_completion *done = work->done;
+		bool need_wake_up = false;
 
-		trace_writeback_exec(bdi, work);
+		trace_writeback_exec(wb->bdi, work);
 
 		wrote += wb_writeback(wb, work);
 
-		/*
-		 * Notify the caller of completion if this is a synchronous
-		 * work item, otherwise just free it.
-		 */
-		if (work->done)
-			complete(work->done);
-		else
+		if (work->single_wait) {
+			WARN_ON_ONCE(work->auto_free);
+			/* paired w/ rmb in wb_wait_for_single_work() */
+			smp_wmb();
+			work->single_done = 1;
+			need_wake_up = true;
+		} else if (work->auto_free) {
 			kfree(work);
+		}
+
+		if (done && atomic_dec_and_test(&done->cnt))
+			need_wake_up = true;
+
+		if (need_wake_up)
+			wake_up_all(&wb->bdi->wb_waitq);
 	}
 
 	/*
@@ -1079,7 +1811,7 @@ static long wb_do_writeback(struct bdi_writeback *wb)
 	 */
 	wrote += wb_check_old_data_flush(wb);
 	wrote += wb_check_background_flush(wb);
-	clear_bit(BDI_writeback_running, &wb->bdi->state);
+	clear_bit(WB_writeback_running, &wb->state);
 
 	return wrote;
 }
@@ -1088,43 +1820,42 @@ static long wb_do_writeback(struct bdi_writeback *wb)
  * Handle writeback of dirty data for the device backed by this bdi. Also
  * reschedules periodically and does kupdated style flushing.
  */
-void bdi_writeback_workfn(struct work_struct *work)
+void wb_workfn(struct work_struct *work)
 {
 	struct bdi_writeback *wb = container_of(to_delayed_work(work),
 						struct bdi_writeback, dwork);
-	struct backing_dev_info *bdi = wb->bdi;
 	long pages_written;
 
-	set_worker_desc("flush-%s", dev_name(bdi->dev));
+	set_worker_desc("flush-%s", dev_name(wb->bdi->dev));
 	current->flags |= PF_SWAPWRITE;
 
 	if (likely(!current_is_workqueue_rescuer() ||
-		   !test_bit(BDI_registered, &bdi->state))) {
+		   !test_bit(WB_registered, &wb->state))) {
 		/*
-		 * The normal path.  Keep writing back @bdi until its
+		 * The normal path.  Keep writing back @wb until its
 		 * work_list is empty.  Note that this path is also taken
-		 * if @bdi is shutting down even when we're running off the
+		 * if @wb is shutting down even when we're running off the
 		 * rescuer as work_list needs to be drained.
 		 */
 		do {
 			pages_written = wb_do_writeback(wb);
 			trace_writeback_pages_written(pages_written);
-		} while (!list_empty(&bdi->work_list));
+		} while (!list_empty(&wb->work_list));
 	} else {
 		/*
 		 * bdi_wq can't get enough workers and we're running off
 		 * the emergency worker.  Don't hog it.  Hopefully, 1024 is
 		 * enough for efficient IO.
 		 */
-		pages_written = writeback_inodes_wb(&bdi->wb, 1024,
+		pages_written = writeback_inodes_wb(wb, 1024,
 						    WB_REASON_FORKER_THREAD);
 		trace_writeback_pages_written(pages_written);
 	}
 
-	if (!list_empty(&bdi->work_list))
+	if (!list_empty(&wb->work_list))
 		mod_delayed_work(bdi_wq, &wb->dwork, 0);
 	else if (wb_has_dirty_io(wb) && dirty_writeback_interval)
-		bdi_wakeup_thread_delayed(bdi);
+		wb_wakeup_delayed(wb);
 
 	current->flags &= ~PF_SWAPWRITE;
 }
@@ -1142,9 +1873,15 @@ void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
+		struct bdi_writeback *wb;
+		struct wb_iter iter;
+
 		if (!bdi_has_dirty_io(bdi))
 			continue;
-		__bdi_start_writeback(bdi, nr_pages, false, reason);
+
+		bdi_for_each_wb(wb, bdi, &iter, 0)
+			wb_start_writeback(wb, wb_split_bdi_pages(wb, nr_pages),
+					   false, reason);
 	}
 	rcu_read_unlock();
 }
@@ -1173,9 +1910,12 @@ static void wakeup_dirtytime_writeback(struct work_struct *w)
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
-		if (list_empty(&bdi->wb.b_dirty_time))
-			continue;
-		bdi_wakeup_thread(bdi);
+		struct bdi_writeback *wb;
+		struct wb_iter iter;
+
+		bdi_for_each_wb(wb, bdi, &iter, 0)
+			if (!list_empty(&bdi->wb.b_dirty_time))
+				wb_wakeup(&bdi->wb);
 	}
 	rcu_read_unlock();
 	schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
@@ -1249,7 +1989,6 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode)
 void __mark_inode_dirty(struct inode *inode, int flags)
 {
 	struct super_block *sb = inode->i_sb;
-	struct backing_dev_info *bdi = NULL;
 	int dirtytime;
 
 	trace_writeback_mark_inode_dirty(inode, flags);
@@ -1289,6 +2028,8 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 	if ((inode->i_state & flags) != flags) {
 		const int was_dirty = inode->i_state & I_DIRTY;
 
+		inode_attach_wb(inode, NULL);
+
 		if (flags & I_DIRTY_INODE)
 			inode->i_state &= ~I_DIRTY_TIME;
 		inode->i_state |= flags;
@@ -1317,38 +2058,39 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 		 * reposition it (that would break b_dirty time-ordering).
 		 */
 		if (!was_dirty) {
+			struct bdi_writeback *wb;
+			struct list_head *dirty_list;
 			bool wakeup_bdi = false;
-			bdi = inode_to_bdi(inode);
 
-			spin_unlock(&inode->i_lock);
-			spin_lock(&bdi->wb.list_lock);
-			if (bdi_cap_writeback_dirty(bdi)) {
-				WARN(!test_bit(BDI_registered, &bdi->state),
-				     "bdi-%s not registered\n", bdi->name);
+			wb = locked_inode_to_wb_and_lock_list(inode);
 
-				/*
-				 * If this is the first dirty inode for this
-				 * bdi, we have to wake-up the corresponding
-				 * bdi thread to make sure background
-				 * write-back happens later.
-				 */
-				if (!wb_has_dirty_io(&bdi->wb))
-					wakeup_bdi = true;
-			}
+			WARN(bdi_cap_writeback_dirty(wb->bdi) &&
+			     !test_bit(WB_registered, &wb->state),
+			     "bdi-%s not registered\n", wb->bdi->name);
 
 			inode->dirtied_when = jiffies;
 			if (dirtytime)
 				inode->dirtied_time_when = jiffies;
+
 			if (inode->i_state & (I_DIRTY_INODE | I_DIRTY_PAGES))
-				list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
+				dirty_list = &wb->b_dirty;
 			else
-				list_move(&inode->i_wb_list,
-					  &bdi->wb.b_dirty_time);
-			spin_unlock(&bdi->wb.list_lock);
+				dirty_list = &wb->b_dirty_time;
+
+			wakeup_bdi = inode_wb_list_move_locked(inode, wb,
+							       dirty_list);
+
+			spin_unlock(&wb->list_lock);
 			trace_writeback_dirty_inode_enqueue(inode);
 
-			if (wakeup_bdi)
-				bdi_wakeup_thread_delayed(bdi);
+			/*
+			 * If this is the first dirty inode for this bdi,
+			 * we have to wake-up the corresponding bdi thread
+			 * to make sure background write-back happens
+			 * later.
+			 */
+			if (bdi_cap_writeback_dirty(wb->bdi) && wakeup_bdi)
+				wb_wakeup_delayed(wb);
 			return;
 		}
 	}
@@ -1411,6 +2153,28 @@ static void wait_sb_inodes(struct super_block *sb)
 	iput(old_inode);
 }
 
+static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr,
+				     enum wb_reason reason, bool skip_if_busy)
+{
+	DEFINE_WB_COMPLETION_ONSTACK(done);
+	struct wb_writeback_work work = {
+		.sb			= sb,
+		.sync_mode		= WB_SYNC_NONE,
+		.tagged_writepages	= 1,
+		.done			= &done,
+		.nr_pages		= nr,
+		.reason			= reason,
+	};
+	struct backing_dev_info *bdi = sb->s_bdi;
+
+	if (!bdi_has_dirty_io(bdi) || bdi == &noop_backing_dev_info)
+		return;
+	WARN_ON(!rwsem_is_locked(&sb->s_umount));
+
+	bdi_split_work_to_wbs(sb->s_bdi, &work, skip_if_busy);
+	wb_wait_for_completion(bdi, &done);
+}
+
 /**
  * writeback_inodes_sb_nr -	writeback dirty inodes from given super_block
  * @sb: the superblock
@@ -1425,21 +2189,7 @@ void writeback_inodes_sb_nr(struct super_block *sb,
 			    unsigned long nr,
 			    enum wb_reason reason)
 {
-	DECLARE_COMPLETION_ONSTACK(done);
-	struct wb_writeback_work work = {
-		.sb			= sb,
-		.sync_mode		= WB_SYNC_NONE,
-		.tagged_writepages	= 1,
-		.done			= &done,
-		.nr_pages		= nr,
-		.reason			= reason,
-	};
-
-	if (sb->s_bdi == &noop_backing_dev_info)
-		return;
-	WARN_ON(!rwsem_is_locked(&sb->s_umount));
-	bdi_queue_work(sb->s_bdi, &work);
-	wait_for_completion(&done);
+	__writeback_inodes_sb_nr(sb, nr, reason, false);
 }
 EXPORT_SYMBOL(writeback_inodes_sb_nr);
 
@@ -1467,19 +2217,15 @@ EXPORT_SYMBOL(writeback_inodes_sb);
  * Invoke writeback_inodes_sb_nr if no writeback is currently underway.
  * Returns 1 if writeback was started, 0 if not.
  */
-int try_to_writeback_inodes_sb_nr(struct super_block *sb,
-				  unsigned long nr,
-				  enum wb_reason reason)
+bool try_to_writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr,
+				   enum wb_reason reason)
 {
-	if (writeback_in_progress(sb->s_bdi))
-		return 1;
-
 	if (!down_read_trylock(&sb->s_umount))
-		return 0;
+		return false;
 
-	writeback_inodes_sb_nr(sb, nr, reason);
+	__writeback_inodes_sb_nr(sb, nr, reason, true);
 	up_read(&sb->s_umount);
-	return 1;
+	return true;
 }
 EXPORT_SYMBOL(try_to_writeback_inodes_sb_nr);
 
@@ -1491,7 +2237,7 @@ EXPORT_SYMBOL(try_to_writeback_inodes_sb_nr);
  * Implement by try_to_writeback_inodes_sb_nr()
  * Returns 1 if writeback was started, 0 if not.
  */
-int try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
+bool try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
 {
 	return try_to_writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);
 }
@@ -1506,7 +2252,7 @@ EXPORT_SYMBOL(try_to_writeback_inodes_sb);
  */
 void sync_inodes_sb(struct super_block *sb)
 {
-	DECLARE_COMPLETION_ONSTACK(done);
+	DEFINE_WB_COMPLETION_ONSTACK(done);
 	struct wb_writeback_work work = {
 		.sb		= sb,
 		.sync_mode	= WB_SYNC_ALL,
@@ -1516,14 +2262,15 @@ void sync_inodes_sb(struct super_block *sb)
 		.reason		= WB_REASON_SYNC,
 		.for_sync	= 1,
 	};
+	struct backing_dev_info *bdi = sb->s_bdi;
 
 	/* Nothing to do? */
-	if (sb->s_bdi == &noop_backing_dev_info)
+	if (!bdi_has_dirty_io(bdi) || bdi == &noop_backing_dev_info)
 		return;
 	WARN_ON(!rwsem_is_locked(&sb->s_umount));
 
-	bdi_queue_work(sb->s_bdi, &work);
-	wait_for_completion(&done);
+	bdi_split_work_to_wbs(bdi, &work, false);
+	wb_wait_for_completion(bdi, &done);
 
 	wait_sb_inodes(sb);
 }
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 64835cf589363e..014fa8ba2b5189 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1445,9 +1445,9 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
 
 	list_del(&req->writepages_entry);
 	for (i = 0; i < req->num_pages; i++) {
-		dec_bdi_stat(bdi, BDI_WRITEBACK);
+		dec_wb_stat(&bdi->wb, WB_WRITEBACK);
 		dec_zone_page_state(req->pages[i], NR_WRITEBACK_TEMP);
-		bdi_writeout_inc(bdi);
+		wb_writeout_inc(&bdi->wb);
 	}
 	wake_up(&fi->page_waitq);
 }
@@ -1635,7 +1635,7 @@ static int fuse_writepage_locked(struct page *page)
 	req->end = fuse_writepage_end;
 	req->inode = inode;
 
-	inc_bdi_stat(inode_to_bdi(inode), BDI_WRITEBACK);
+	inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
 	inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
 
 	spin_lock(&fc->lock);
@@ -1749,9 +1749,9 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req,
 		copy_highpage(old_req->pages[0], page);
 		spin_unlock(&fc->lock);
 
-		dec_bdi_stat(bdi, BDI_WRITEBACK);
+		dec_wb_stat(&bdi->wb, WB_WRITEBACK);
 		dec_zone_page_state(page, NR_WRITEBACK_TEMP);
-		bdi_writeout_inc(bdi);
+		wb_writeout_inc(&bdi->wb);
 		fuse_writepage_free(fc, new_req);
 		fuse_request_free(new_req);
 		goto out;
@@ -1848,7 +1848,7 @@ static int fuse_writepages_fill(struct page *page,
 	req->page_descs[req->num_pages].offset = 0;
 	req->page_descs[req->num_pages].length = PAGE_SIZE;
 
-	inc_bdi_stat(inode_to_bdi(inode), BDI_WRITEBACK);
+	inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
 	inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
 
 	err = 0;
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 859c6edbf81a07..2982445947e174 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -748,7 +748,7 @@ static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc)
 
 	if (wbc->sync_mode == WB_SYNC_ALL)
 		gfs2_log_flush(GFS2_SB(inode), ip->i_gl, NORMAL_FLUSH);
-	if (bdi->dirty_exceeded)
+	if (bdi->wb.dirty_exceeded)
 		gfs2_ail1_flush(sdp, wbc);
 	else
 		filemap_fdatawrite(metamapping);
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index eee7206c38d18e..55c03b9e90708e 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -14,6 +14,7 @@
 
 #include <linux/module.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/mount.h>
 #include <linux/init.h>
 #include <linux/nls.h>
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 593af2fdcc2daf..7302d96ae8bfd7 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -11,6 +11,7 @@
 #include <linux/init.h>
 #include <linux/pagemap.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/fs.h>
 #include <linux/slab.h>
 #include <linux/vfs.h>
diff --git a/fs/inode.c b/fs/inode.c
index e8d62688ed9181..069721f0cc0e0b 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -224,6 +224,7 @@ EXPORT_SYMBOL(free_inode_nonrcu);
 void __destroy_inode(struct inode *inode)
 {
 	BUG_ON(inode_has_buffers(inode));
+	inode_detach_wb(inode);
 	security_inode_free(inode);
 	fsnotify_inode_delete(inode);
 	locks_free_lock_context(inode->i_flctx);
diff --git a/fs/mpage.c b/fs/mpage.c
index 3e79220babac28..ca0244b69de8ba 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -605,6 +605,8 @@ alloc_new:
 				bio_get_nr_vecs(bdev), GFP_NOFS|__GFP_HIGH);
 		if (bio == NULL)
 			goto confused;
+
+		wbc_init_bio(wbc, bio);
 	}
 
 	/*
@@ -612,6 +614,7 @@ alloc_new:
 	 * the confused fail path above (OOM) will be very confused when
 	 * it finds all bh marked clean (i.e. it will not write anything)
 	 */
+	wbc_account_io(wbc, page, PAGE_SIZE);
 	length = first_unmapped << blkbits;
 	if (bio_add_page(bio, page, length, 0) < length) {
 		bio = mpage_bio_submit(WRITE, bio);
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index a46bf6de9ce455..b34f2e22860168 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -32,6 +32,7 @@
 #include <linux/nfs_fs.h>
 #include <linux/nfs_page.h>
 #include <linux/module.h>
+#include <linux/backing-dev.h>
 
 #include <linux/sunrpc/metrics.h>
 
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 9e6475bc5ba22b..7e3c4604bea8a6 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -607,7 +607,7 @@ void nfs_mark_page_unstable(struct page *page)
 	struct inode *inode = page_file_mapping(page)->host;
 
 	inc_zone_page_state(page, NR_UNSTABLE_NFS);
-	inc_bdi_stat(inode_to_bdi(inode), BDI_RECLAIMABLE);
+	inc_wb_stat(&inode_to_bdi(inode)->wb, WB_RECLAIMABLE);
 	 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
 }
 
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index dfc19f1575a19d..e6c262555e08a6 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -853,7 +853,8 @@ static void
 nfs_clear_page_commit(struct page *page)
 {
 	dec_zone_page_state(page, NR_UNSTABLE_NFS);
-	dec_bdi_stat(inode_to_bdi(page_file_mapping(page)->host), BDI_RECLAIMABLE);
+	dec_wb_stat(&inode_to_bdi(page_file_mapping(page)->host)->wb,
+		    WB_RECLAIMABLE);
 }
 
 /* Called holding inode (/cinfo) lock */
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index dc3a9efdaab877..42468e5ab3e71a 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -343,11 +343,6 @@ static void nilfs_end_bio_write(struct bio *bio, int err)
 	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	struct nilfs_segment_buffer *segbuf = bio->bi_private;
 
-	if (err == -EOPNOTSUPP) {
-		set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
-		/* to be detected by nilfs_segbuf_submit_bio() */
-	}
-
 	if (!uptodate)
 		atomic_inc(&segbuf->sb_err);
 
@@ -374,15 +369,8 @@ static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf,
 
 	bio->bi_end_io = nilfs_end_bio_write;
 	bio->bi_private = segbuf;
-	bio_get(bio);
 	submit_bio(mode, bio);
 	segbuf->sb_nbio++;
-	if (bio_flagged(bio, BIO_EOPNOTSUPP)) {
-		bio_put(bio);
-		err = -EOPNOTSUPP;
-		goto failed;
-	}
-	bio_put(bio);
 
 	wi->bio = NULL;
 	wi->rest_blocks -= wi->end - wi->start;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index d8b670cbd90929..8f1feca89fb08f 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -37,6 +37,7 @@
 #include <linux/falloc.h>
 #include <linux/quotaops.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 
 #include <cluster/masklog.h>
 
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 0111ad0466ed42..3e0af317fcc3eb 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -21,6 +21,7 @@
 #include "xattr.h"
 #include <linux/init.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/buffer_head.h>
 #include <linux/exportfs.h>
 #include <linux/quotaops.h>
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index b3bc3e7ae79db3..098508a93c7b30 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -80,6 +80,7 @@
 #include <linux/stat.h>
 #include <linux/string.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/init.h>
 #include <linux/parser.h>
 #include <linux/buffer_head.h>
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index dc52698073990a..3859f5e27a4dc2 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -356,7 +356,6 @@ xfs_end_bio(
 {
 	xfs_ioend_t		*ioend = bio->bi_private;
 
-	ASSERT(atomic_read(&bio->bi_cnt) >= 1);
 	ioend->io_error = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : error;
 
 	/* Toss bio and pass work off to an xfsdatad thread */
@@ -1942,6 +1941,7 @@ xfs_vm_set_page_dirty(
 	loff_t			end_offset;
 	loff_t			offset;
 	int			newly_dirty;
+	struct mem_cgroup	*memcg;
 
 	if (unlikely(!mapping))
 		return !TestSetPageDirty(page);
@@ -1961,6 +1961,11 @@ xfs_vm_set_page_dirty(
 			offset += 1 << inode->i_blkbits;
 		} while (bh != head);
 	}
+	/*
+	 * Use mem_group_begin_page_stat() to keep PageDirty synchronized with
+	 * per-memcg dirty page counters.
+	 */
+	memcg = mem_cgroup_begin_page_stat(page);
 	newly_dirty = !TestSetPageDirty(page);
 	spin_unlock(&mapping->private_lock);
 
@@ -1971,13 +1976,15 @@ xfs_vm_set_page_dirty(
 		spin_lock_irqsave(&mapping->tree_lock, flags);
 		if (page->mapping) {	/* Race with truncate? */
 			WARN_ON_ONCE(!PageUptodate(page));
-			account_page_dirtied(page, mapping);
+			account_page_dirtied(page, mapping, memcg);
 			radix_tree_tag_set(&mapping->page_tree,
 					page_index(page), PAGECACHE_TAG_DIRTY);
 		}
 		spin_unlock_irqrestore(&mapping->tree_lock, flags);
-		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 	}
+	mem_cgroup_end_page_stat(memcg);
+	if (newly_dirty)
+		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 	return newly_dirty;
 }
 
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 97d92c14476835..874507de3485b8 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -41,6 +41,7 @@
 #include <linux/dcache.h>
 #include <linux/falloc.h>
 #include <linux/pagevec.h>
+#include <linux/backing-dev.h>
 
 static const struct vm_operations_struct xfs_file_vm_ops;
 
diff --git a/include/asm-generic/scatterlist.h b/include/asm-generic/scatterlist.h
deleted file mode 100644
index 5de07355fad492..00000000000000
--- a/include/asm-generic/scatterlist.h
+++ /dev/null
@@ -1,34 +0,0 @@
-#ifndef __ASM_GENERIC_SCATTERLIST_H
-#define __ASM_GENERIC_SCATTERLIST_H
-
-#include <linux/types.h>
-
-struct scatterlist {
-#ifdef CONFIG_DEBUG_SG
-	unsigned long	sg_magic;
-#endif
-	unsigned long	page_link;
-	unsigned int	offset;
-	unsigned int	length;
-	dma_addr_t	dma_address;
-#ifdef CONFIG_NEED_SG_DMA_LENGTH
-	unsigned int	dma_length;
-#endif
-};
-
-/*
- * These macros should be used after a dma_map_sg call has been done
- * to get bus addresses of each of the SG entries and their lengths.
- * You should only work with the number of sg entries pci_map_sg
- * returns, or alternatively stop on the first sg_dma_len(sg) which
- * is 0.
- */
-#define sg_dma_address(sg)	((sg)->dma_address)
-
-#ifdef CONFIG_NEED_SG_DMA_LENGTH
-#define sg_dma_len(sg)		((sg)->dma_length)
-#else
-#define sg_dma_len(sg)		((sg)->length)
-#endif
-
-#endif /* __ASM_GENERIC_SCATTERLIST_H */
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
new file mode 100644
index 00000000000000..a48d90e3bcbb86
--- /dev/null
+++ b/include/linux/backing-dev-defs.h
@@ -0,0 +1,255 @@
+#ifndef __LINUX_BACKING_DEV_DEFS_H
+#define __LINUX_BACKING_DEV_DEFS_H
+
+#include <linux/list.h>
+#include <linux/radix-tree.h>
+#include <linux/rbtree.h>
+#include <linux/spinlock.h>
+#include <linux/percpu_counter.h>
+#include <linux/percpu-refcount.h>
+#include <linux/flex_proportions.h>
+#include <linux/timer.h>
+#include <linux/workqueue.h>
+
+struct page;
+struct device;
+struct dentry;
+
+/*
+ * Bits in bdi_writeback.state
+ */
+enum wb_state {
+	WB_registered,		/* bdi_register() was done */
+	WB_writeback_running,	/* Writeback is in progress */
+	WB_has_dirty_io,	/* Dirty inodes on ->b_{dirty|io|more_io} */
+};
+
+enum wb_congested_state {
+	WB_async_congested,	/* The async (write) queue is getting full */
+	WB_sync_congested,	/* The sync queue is getting full */
+};
+
+typedef int (congested_fn)(void *, int);
+
+enum wb_stat_item {
+	WB_RECLAIMABLE,
+	WB_WRITEBACK,
+	WB_DIRTIED,
+	WB_WRITTEN,
+	NR_WB_STAT_ITEMS
+};
+
+#define WB_STAT_BATCH (8*(1+ilog2(nr_cpu_ids)))
+
+/*
+ * For cgroup writeback, multiple wb's may map to the same blkcg.  Those
+ * wb's can operate mostly independently but should share the congested
+ * state.  To facilitate such sharing, the congested state is tracked using
+ * the following struct which is created on demand, indexed by blkcg ID on
+ * its bdi, and refcounted.
+ */
+struct bdi_writeback_congested {
+	unsigned long state;		/* WB_[a]sync_congested flags */
+
+#ifdef CONFIG_CGROUP_WRITEBACK
+	struct backing_dev_info *bdi;	/* the associated bdi */
+	atomic_t refcnt;		/* nr of attached wb's and blkg */
+	int blkcg_id;			/* ID of the associated blkcg */
+	struct rb_node rb_node;		/* on bdi->cgwb_congestion_tree */
+#endif
+};
+
+/*
+ * Each wb (bdi_writeback) can perform writeback operations, is measured
+ * and throttled, independently.  Without cgroup writeback, each bdi
+ * (bdi_writeback) is served by its embedded bdi->wb.
+ *
+ * On the default hierarchy, blkcg implicitly enables memcg.  This allows
+ * using memcg's page ownership for attributing writeback IOs, and every
+ * memcg - blkcg combination can be served by its own wb by assigning a
+ * dedicated wb to each memcg, which enables isolation across different
+ * cgroups and propagation of IO back pressure down from the IO layer upto
+ * the tasks which are generating the dirty pages to be written back.
+ *
+ * A cgroup wb is indexed on its bdi by the ID of the associated memcg,
+ * refcounted with the number of inodes attached to it, and pins the memcg
+ * and the corresponding blkcg.  As the corresponding blkcg for a memcg may
+ * change as blkcg is disabled and enabled higher up in the hierarchy, a wb
+ * is tested for blkcg after lookup and removed from index on mismatch so
+ * that a new wb for the combination can be created.
+ */
+struct bdi_writeback {
+	struct backing_dev_info *bdi;	/* our parent bdi */
+
+	unsigned long state;		/* Always use atomic bitops on this */
+	unsigned long last_old_flush;	/* last old data flush */
+
+	struct list_head b_dirty;	/* dirty inodes */
+	struct list_head b_io;		/* parked for writeback */
+	struct list_head b_more_io;	/* parked for more writeback */
+	struct list_head b_dirty_time;	/* time stamps are dirty */
+	spinlock_t list_lock;		/* protects the b_* lists */
+
+	struct percpu_counter stat[NR_WB_STAT_ITEMS];
+
+	struct bdi_writeback_congested *congested;
+
+	unsigned long bw_time_stamp;	/* last time write bw is updated */
+	unsigned long dirtied_stamp;
+	unsigned long written_stamp;	/* pages written at bw_time_stamp */
+	unsigned long write_bandwidth;	/* the estimated write bandwidth */
+	unsigned long avg_write_bandwidth; /* further smoothed write bw, > 0 */
+
+	/*
+	 * The base dirty throttle rate, re-calculated on every 200ms.
+	 * All the bdi tasks' dirty rate will be curbed under it.
+	 * @dirty_ratelimit tracks the estimated @balanced_dirty_ratelimit
+	 * in small steps and is much more smooth/stable than the latter.
+	 */
+	unsigned long dirty_ratelimit;
+	unsigned long balanced_dirty_ratelimit;
+
+	struct fprop_local_percpu completions;
+	int dirty_exceeded;
+
+	spinlock_t work_lock;		/* protects work_list & dwork scheduling */
+	struct list_head work_list;
+	struct delayed_work dwork;	/* work item used for writeback */
+
+#ifdef CONFIG_CGROUP_WRITEBACK
+	struct percpu_ref refcnt;	/* used only for !root wb's */
+	struct fprop_local_percpu memcg_completions;
+	struct cgroup_subsys_state *memcg_css; /* the associated memcg */
+	struct cgroup_subsys_state *blkcg_css; /* and blkcg */
+	struct list_head memcg_node;	/* anchored at memcg->cgwb_list */
+	struct list_head blkcg_node;	/* anchored at blkcg->cgwb_list */
+
+	union {
+		struct work_struct release_work;
+		struct rcu_head rcu;
+	};
+#endif
+};
+
+struct backing_dev_info {
+	struct list_head bdi_list;
+	unsigned long ra_pages;	/* max readahead in PAGE_CACHE_SIZE units */
+	unsigned int capabilities; /* Device capabilities */
+	congested_fn *congested_fn; /* Function pointer if device is md/dm */
+	void *congested_data;	/* Pointer to aux data for congested func */
+
+	char *name;
+
+	unsigned int min_ratio;
+	unsigned int max_ratio, max_prop_frac;
+
+	/*
+	 * Sum of avg_write_bw of wbs with dirty inodes.  > 0 if there are
+	 * any dirty wbs, which is depended upon by bdi_has_dirty().
+	 */
+	atomic_long_t tot_write_bandwidth;
+
+	struct bdi_writeback wb;  /* the root writeback info for this bdi */
+	struct bdi_writeback_congested wb_congested; /* its congested state */
+#ifdef CONFIG_CGROUP_WRITEBACK
+	struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */
+	struct rb_root cgwb_congested_tree; /* their congested states */
+	atomic_t usage_cnt; /* counts both cgwbs and cgwb_contested's */
+#endif
+	wait_queue_head_t wb_waitq;
+
+	struct device *dev;
+
+	struct timer_list laptop_mode_wb_timer;
+
+#ifdef CONFIG_DEBUG_FS
+	struct dentry *debug_dir;
+	struct dentry *debug_stats;
+#endif
+};
+
+enum {
+	BLK_RW_ASYNC	= 0,
+	BLK_RW_SYNC	= 1,
+};
+
+void clear_wb_congested(struct bdi_writeback_congested *congested, int sync);
+void set_wb_congested(struct bdi_writeback_congested *congested, int sync);
+
+static inline void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
+{
+	clear_wb_congested(bdi->wb.congested, sync);
+}
+
+static inline void set_bdi_congested(struct backing_dev_info *bdi, int sync)
+{
+	set_wb_congested(bdi->wb.congested, sync);
+}
+
+#ifdef CONFIG_CGROUP_WRITEBACK
+
+/**
+ * wb_tryget - try to increment a wb's refcount
+ * @wb: bdi_writeback to get
+ */
+static inline bool wb_tryget(struct bdi_writeback *wb)
+{
+	if (wb != &wb->bdi->wb)
+		return percpu_ref_tryget(&wb->refcnt);
+	return true;
+}
+
+/**
+ * wb_get - increment a wb's refcount
+ * @wb: bdi_writeback to get
+ */
+static inline void wb_get(struct bdi_writeback *wb)
+{
+	if (wb != &wb->bdi->wb)
+		percpu_ref_get(&wb->refcnt);
+}
+
+/**
+ * wb_put - decrement a wb's refcount
+ * @wb: bdi_writeback to put
+ */
+static inline void wb_put(struct bdi_writeback *wb)
+{
+	if (wb != &wb->bdi->wb)
+		percpu_ref_put(&wb->refcnt);
+}
+
+/**
+ * wb_dying - is a wb dying?
+ * @wb: bdi_writeback of interest
+ *
+ * Returns whether @wb is unlinked and being drained.
+ */
+static inline bool wb_dying(struct bdi_writeback *wb)
+{
+	return percpu_ref_is_dying(&wb->refcnt);
+}
+
+#else	/* CONFIG_CGROUP_WRITEBACK */
+
+static inline bool wb_tryget(struct bdi_writeback *wb)
+{
+	return true;
+}
+
+static inline void wb_get(struct bdi_writeback *wb)
+{
+}
+
+static inline void wb_put(struct bdi_writeback *wb)
+{
+}
+
+static inline bool wb_dying(struct bdi_writeback *wb)
+{
+	return false;
+}
+
+#endif	/* CONFIG_CGROUP_WRITEBACK */
+
+#endif	/* __LINUX_BACKING_DEV_DEFS_H */
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index d87d8eced06407..a5e6695744eac3 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -8,106 +8,13 @@
 #ifndef _LINUX_BACKING_DEV_H
 #define _LINUX_BACKING_DEV_H
 
-#include <linux/percpu_counter.h>
-#include <linux/log2.h>
-#include <linux/flex_proportions.h>
 #include <linux/kernel.h>
 #include <linux/fs.h>
 #include <linux/sched.h>
-#include <linux/timer.h>
+#include <linux/blkdev.h>
 #include <linux/writeback.h>
-#include <linux/atomic.h>
-#include <linux/sysctl.h>
-#include <linux/workqueue.h>
-
-struct page;
-struct device;
-struct dentry;
-
-/*
- * Bits in backing_dev_info.state
- */
-enum bdi_state {
-	BDI_async_congested,	/* The async (write) queue is getting full */
-	BDI_sync_congested,	/* The sync queue is getting full */
-	BDI_registered,		/* bdi_register() was done */
-	BDI_writeback_running,	/* Writeback is in progress */
-};
-
-typedef int (congested_fn)(void *, int);
-
-enum bdi_stat_item {
-	BDI_RECLAIMABLE,
-	BDI_WRITEBACK,
-	BDI_DIRTIED,
-	BDI_WRITTEN,
-	NR_BDI_STAT_ITEMS
-};
-
-#define BDI_STAT_BATCH (8*(1+ilog2(nr_cpu_ids)))
-
-struct bdi_writeback {
-	struct backing_dev_info *bdi;	/* our parent bdi */
-
-	unsigned long last_old_flush;	/* last old data flush */
-
-	struct delayed_work dwork;	/* work item used for writeback */
-	struct list_head b_dirty;	/* dirty inodes */
-	struct list_head b_io;		/* parked for writeback */
-	struct list_head b_more_io;	/* parked for more writeback */
-	struct list_head b_dirty_time;	/* time stamps are dirty */
-	spinlock_t list_lock;		/* protects the b_* lists */
-};
-
-struct backing_dev_info {
-	struct list_head bdi_list;
-	unsigned long ra_pages;	/* max readahead in PAGE_CACHE_SIZE units */
-	unsigned long state;	/* Always use atomic bitops on this */
-	unsigned int capabilities; /* Device capabilities */
-	congested_fn *congested_fn; /* Function pointer if device is md/dm */
-	void *congested_data;	/* Pointer to aux data for congested func */
-
-	char *name;
-
-	struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS];
-
-	unsigned long bw_time_stamp;	/* last time write bw is updated */
-	unsigned long dirtied_stamp;
-	unsigned long written_stamp;	/* pages written at bw_time_stamp */
-	unsigned long write_bandwidth;	/* the estimated write bandwidth */
-	unsigned long avg_write_bandwidth; /* further smoothed write bw */
-
-	/*
-	 * The base dirty throttle rate, re-calculated on every 200ms.
-	 * All the bdi tasks' dirty rate will be curbed under it.
-	 * @dirty_ratelimit tracks the estimated @balanced_dirty_ratelimit
-	 * in small steps and is much more smooth/stable than the latter.
-	 */
-	unsigned long dirty_ratelimit;
-	unsigned long balanced_dirty_ratelimit;
-
-	struct fprop_local_percpu completions;
-	int dirty_exceeded;
-
-	unsigned int min_ratio;
-	unsigned int max_ratio, max_prop_frac;
-
-	struct bdi_writeback wb;  /* default writeback info for this bdi */
-	spinlock_t wb_lock;	  /* protects work_list & wb.dwork scheduling */
-
-	struct list_head work_list;
-
-	struct device *dev;
-
-	struct timer_list laptop_mode_wb_timer;
-
-#ifdef CONFIG_DEBUG_FS
-	struct dentry *debug_dir;
-	struct dentry *debug_stats;
-#endif
-};
-
-struct backing_dev_info *inode_to_bdi(struct inode *inode);
+#include <linux/blk-cgroup.h>
+#include <linux/backing-dev-defs.h>
 
 int __must_check bdi_init(struct backing_dev_info *bdi);
 void bdi_destroy(struct backing_dev_info *bdi);
@@ -117,97 +24,99 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
 		const char *fmt, ...);
 int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
 int __must_check bdi_setup_and_register(struct backing_dev_info *, char *);
-void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
-			enum wb_reason reason);
-void bdi_start_background_writeback(struct backing_dev_info *bdi);
-void bdi_writeback_workfn(struct work_struct *work);
-int bdi_has_dirty_io(struct backing_dev_info *bdi);
-void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi);
+void wb_start_writeback(struct bdi_writeback *wb, long nr_pages,
+			bool range_cyclic, enum wb_reason reason);
+void wb_start_background_writeback(struct bdi_writeback *wb);
+void wb_workfn(struct work_struct *work);
+void wb_wakeup_delayed(struct bdi_writeback *wb);
 
 extern spinlock_t bdi_lock;
 extern struct list_head bdi_list;
 
 extern struct workqueue_struct *bdi_wq;
 
-static inline int wb_has_dirty_io(struct bdi_writeback *wb)
+static inline bool wb_has_dirty_io(struct bdi_writeback *wb)
 {
-	return !list_empty(&wb->b_dirty) ||
-	       !list_empty(&wb->b_io) ||
-	       !list_empty(&wb->b_more_io);
+	return test_bit(WB_has_dirty_io, &wb->state);
+}
+
+static inline bool bdi_has_dirty_io(struct backing_dev_info *bdi)
+{
+	/*
+	 * @bdi->tot_write_bandwidth is guaranteed to be > 0 if there are
+	 * any dirty wbs.  See wb_update_write_bandwidth().
+	 */
+	return atomic_long_read(&bdi->tot_write_bandwidth);
 }
 
-static inline void __add_bdi_stat(struct backing_dev_info *bdi,
-		enum bdi_stat_item item, s64 amount)
+static inline void __add_wb_stat(struct bdi_writeback *wb,
+				 enum wb_stat_item item, s64 amount)
 {
-	__percpu_counter_add(&bdi->bdi_stat[item], amount, BDI_STAT_BATCH);
+	__percpu_counter_add(&wb->stat[item], amount, WB_STAT_BATCH);
 }
 
-static inline void __inc_bdi_stat(struct backing_dev_info *bdi,
-		enum bdi_stat_item item)
+static inline void __inc_wb_stat(struct bdi_writeback *wb,
+				 enum wb_stat_item item)
 {
-	__add_bdi_stat(bdi, item, 1);
+	__add_wb_stat(wb, item, 1);
 }
 
-static inline void inc_bdi_stat(struct backing_dev_info *bdi,
-		enum bdi_stat_item item)
+static inline void inc_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
 {
 	unsigned long flags;
 
 	local_irq_save(flags);
-	__inc_bdi_stat(bdi, item);
+	__inc_wb_stat(wb, item);
 	local_irq_restore(flags);
 }
 
-static inline void __dec_bdi_stat(struct backing_dev_info *bdi,
-		enum bdi_stat_item item)
+static inline void __dec_wb_stat(struct bdi_writeback *wb,
+				 enum wb_stat_item item)
 {
-	__add_bdi_stat(bdi, item, -1);
+	__add_wb_stat(wb, item, -1);
 }
 
-static inline void dec_bdi_stat(struct backing_dev_info *bdi,
-		enum bdi_stat_item item)
+static inline void dec_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
 {
 	unsigned long flags;
 
 	local_irq_save(flags);
-	__dec_bdi_stat(bdi, item);
+	__dec_wb_stat(wb, item);
 	local_irq_restore(flags);
 }
 
-static inline s64 bdi_stat(struct backing_dev_info *bdi,
-		enum bdi_stat_item item)
+static inline s64 wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
 {
-	return percpu_counter_read_positive(&bdi->bdi_stat[item]);
+	return percpu_counter_read_positive(&wb->stat[item]);
 }
 
-static inline s64 __bdi_stat_sum(struct backing_dev_info *bdi,
-		enum bdi_stat_item item)
+static inline s64 __wb_stat_sum(struct bdi_writeback *wb,
+				enum wb_stat_item item)
 {
-	return percpu_counter_sum_positive(&bdi->bdi_stat[item]);
+	return percpu_counter_sum_positive(&wb->stat[item]);
 }
 
-static inline s64 bdi_stat_sum(struct backing_dev_info *bdi,
-		enum bdi_stat_item item)
+static inline s64 wb_stat_sum(struct bdi_writeback *wb, enum wb_stat_item item)
 {
 	s64 sum;
 	unsigned long flags;
 
 	local_irq_save(flags);
-	sum = __bdi_stat_sum(bdi, item);
+	sum = __wb_stat_sum(wb, item);
 	local_irq_restore(flags);
 
 	return sum;
 }
 
-extern void bdi_writeout_inc(struct backing_dev_info *bdi);
+extern void wb_writeout_inc(struct bdi_writeback *wb);
 
 /*
  * maximal error of a stat counter.
  */
-static inline unsigned long bdi_stat_error(struct backing_dev_info *bdi)
+static inline unsigned long wb_stat_error(struct bdi_writeback *wb)
 {
 #ifdef CONFIG_SMP
-	return nr_cpu_ids * BDI_STAT_BATCH;
+	return nr_cpu_ids * WB_STAT_BATCH;
 #else
 	return 1;
 #endif
@@ -231,50 +140,57 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio);
  * BDI_CAP_NO_WRITEBACK:   Don't write pages back
  * BDI_CAP_NO_ACCT_WB:     Don't automatically account writeback pages
  * BDI_CAP_STRICTLIMIT:    Keep number of dirty pages below bdi threshold.
+ *
+ * BDI_CAP_CGROUP_WRITEBACK: Supports cgroup-aware writeback.
  */
 #define BDI_CAP_NO_ACCT_DIRTY	0x00000001
 #define BDI_CAP_NO_WRITEBACK	0x00000002
 #define BDI_CAP_NO_ACCT_WB	0x00000004
 #define BDI_CAP_STABLE_WRITES	0x00000008
 #define BDI_CAP_STRICTLIMIT	0x00000010
+#define BDI_CAP_CGROUP_WRITEBACK 0x00000020
 
 #define BDI_CAP_NO_ACCT_AND_WRITEBACK \
 	(BDI_CAP_NO_WRITEBACK | BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_ACCT_WB)
 
 extern struct backing_dev_info noop_backing_dev_info;
 
-int writeback_in_progress(struct backing_dev_info *bdi);
-
-static inline int bdi_congested(struct backing_dev_info *bdi, int bdi_bits)
+/**
+ * writeback_in_progress - determine whether there is writeback in progress
+ * @wb: bdi_writeback of interest
+ *
+ * Determine whether there is writeback waiting to be handled against a
+ * bdi_writeback.
+ */
+static inline bool writeback_in_progress(struct bdi_writeback *wb)
 {
-	if (bdi->congested_fn)
-		return bdi->congested_fn(bdi->congested_data, bdi_bits);
-	return (bdi->state & bdi_bits);
+	return test_bit(WB_writeback_running, &wb->state);
 }
 
-static inline int bdi_read_congested(struct backing_dev_info *bdi)
+static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
 {
-	return bdi_congested(bdi, 1 << BDI_sync_congested);
-}
+	struct super_block *sb;
 
-static inline int bdi_write_congested(struct backing_dev_info *bdi)
-{
-	return bdi_congested(bdi, 1 << BDI_async_congested);
+	if (!inode)
+		return &noop_backing_dev_info;
+
+	sb = inode->i_sb;
+#ifdef CONFIG_BLOCK
+	if (sb_is_blkdev_sb(sb))
+		return blk_get_backing_dev_info(I_BDEV(inode));
+#endif
+	return sb->s_bdi;
 }
 
-static inline int bdi_rw_congested(struct backing_dev_info *bdi)
+static inline int wb_congested(struct bdi_writeback *wb, int cong_bits)
 {
-	return bdi_congested(bdi, (1 << BDI_sync_congested) |
-				  (1 << BDI_async_congested));
-}
+	struct backing_dev_info *bdi = wb->bdi;
 
-enum {
-	BLK_RW_ASYNC	= 0,
-	BLK_RW_SYNC	= 1,
-};
+	if (bdi->congested_fn)
+		return bdi->congested_fn(bdi->congested_data, cong_bits);
+	return wb->congested->state & cong_bits;
+}
 
-void clear_bdi_congested(struct backing_dev_info *bdi, int sync);
-void set_bdi_congested(struct backing_dev_info *bdi, int sync);
 long congestion_wait(int sync, long timeout);
 long wait_iff_congested(struct zone *zone, int sync, long timeout);
 int pdflush_proc_obsolete(struct ctl_table *table, int write,
@@ -318,4 +234,333 @@ static inline int bdi_sched_wait(void *word)
 	return 0;
 }
 
-#endif		/* _LINUX_BACKING_DEV_H */
+#ifdef CONFIG_CGROUP_WRITEBACK
+
+struct bdi_writeback_congested *
+wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp);
+void wb_congested_put(struct bdi_writeback_congested *congested);
+struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
+				    struct cgroup_subsys_state *memcg_css,
+				    gfp_t gfp);
+void wb_memcg_offline(struct mem_cgroup *memcg);
+void wb_blkcg_offline(struct blkcg *blkcg);
+int inode_congested(struct inode *inode, int cong_bits);
+
+/**
+ * inode_cgwb_enabled - test whether cgroup writeback is enabled on an inode
+ * @inode: inode of interest
+ *
+ * cgroup writeback requires support from both the bdi and filesystem.
+ * Test whether @inode has both.
+ */
+static inline bool inode_cgwb_enabled(struct inode *inode)
+{
+	struct backing_dev_info *bdi = inode_to_bdi(inode);
+
+	return bdi_cap_account_dirty(bdi) &&
+		(bdi->capabilities & BDI_CAP_CGROUP_WRITEBACK) &&
+		(inode->i_sb->s_type->fs_flags & FS_CGROUP_WRITEBACK);
+}
+
+/**
+ * wb_find_current - find wb for %current on a bdi
+ * @bdi: bdi of interest
+ *
+ * Find the wb of @bdi which matches both the memcg and blkcg of %current.
+ * Must be called under rcu_read_lock() which protects the returend wb.
+ * NULL if not found.
+ */
+static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi)
+{
+	struct cgroup_subsys_state *memcg_css;
+	struct bdi_writeback *wb;
+
+	memcg_css = task_css(current, memory_cgrp_id);
+	if (!memcg_css->parent)
+		return &bdi->wb;
+
+	wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
+
+	/*
+	 * %current's blkcg equals the effective blkcg of its memcg.  No
+	 * need to use the relatively expensive cgroup_get_e_css().
+	 */
+	if (likely(wb && wb->blkcg_css == task_css(current, blkio_cgrp_id)))
+		return wb;
+	return NULL;
+}
+
+/**
+ * wb_get_create_current - get or create wb for %current on a bdi
+ * @bdi: bdi of interest
+ * @gfp: allocation mask
+ *
+ * Equivalent to wb_get_create() on %current's memcg.  This function is
+ * called from a relatively hot path and optimizes the common cases using
+ * wb_find_current().
+ */
+static inline struct bdi_writeback *
+wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp)
+{
+	struct bdi_writeback *wb;
+
+	rcu_read_lock();
+	wb = wb_find_current(bdi);
+	if (wb && unlikely(!wb_tryget(wb)))
+		wb = NULL;
+	rcu_read_unlock();
+
+	if (unlikely(!wb)) {
+		struct cgroup_subsys_state *memcg_css;
+
+		memcg_css = task_get_css(current, memory_cgrp_id);
+		wb = wb_get_create(bdi, memcg_css, gfp);
+		css_put(memcg_css);
+	}
+	return wb;
+}
+
+/**
+ * inode_to_wb_is_valid - test whether an inode has a wb associated
+ * @inode: inode of interest
+ *
+ * Returns %true if @inode has a wb associated.  May be called without any
+ * locking.
+ */
+static inline bool inode_to_wb_is_valid(struct inode *inode)
+{
+	return inode->i_wb;
+}
+
+/**
+ * inode_to_wb - determine the wb of an inode
+ * @inode: inode of interest
+ *
+ * Returns the wb @inode is currently associated with.  The caller must be
+ * holding either @inode->i_lock, @inode->i_mapping->tree_lock, or the
+ * associated wb's list_lock.
+ */
+static inline struct bdi_writeback *inode_to_wb(struct inode *inode)
+{
+#ifdef CONFIG_LOCKDEP
+	WARN_ON_ONCE(debug_locks &&
+		     (!lockdep_is_held(&inode->i_lock) &&
+		      !lockdep_is_held(&inode->i_mapping->tree_lock) &&
+		      !lockdep_is_held(&inode->i_wb->list_lock)));
+#endif
+	return inode->i_wb;
+}
+
+/**
+ * unlocked_inode_to_wb_begin - begin unlocked inode wb access transaction
+ * @inode: target inode
+ * @lockedp: temp bool output param, to be passed to the end function
+ *
+ * The caller wants to access the wb associated with @inode but isn't
+ * holding inode->i_lock, mapping->tree_lock or wb->list_lock.  This
+ * function determines the wb associated with @inode and ensures that the
+ * association doesn't change until the transaction is finished with
+ * unlocked_inode_to_wb_end().
+ *
+ * The caller must call unlocked_inode_to_wb_end() with *@lockdep
+ * afterwards and can't sleep during transaction.  IRQ may or may not be
+ * disabled on return.
+ */
+static inline struct bdi_writeback *
+unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp)
+{
+	rcu_read_lock();
+
+	/*
+	 * Paired with store_release in inode_switch_wb_work_fn() and
+	 * ensures that we see the new wb if we see cleared I_WB_SWITCH.
+	 */
+	*lockedp = smp_load_acquire(&inode->i_state) & I_WB_SWITCH;
+
+	if (unlikely(*lockedp))
+		spin_lock_irq(&inode->i_mapping->tree_lock);
+
+	/*
+	 * Protected by either !I_WB_SWITCH + rcu_read_lock() or tree_lock.
+	 * inode_to_wb() will bark.  Deref directly.
+	 */
+	return inode->i_wb;
+}
+
+/**
+ * unlocked_inode_to_wb_end - end inode wb access transaction
+ * @inode: target inode
+ * @locked: *@lockedp from unlocked_inode_to_wb_begin()
+ */
+static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked)
+{
+	if (unlikely(locked))
+		spin_unlock_irq(&inode->i_mapping->tree_lock);
+
+	rcu_read_unlock();
+}
+
+struct wb_iter {
+	int			start_blkcg_id;
+	struct radix_tree_iter	tree_iter;
+	void			**slot;
+};
+
+static inline struct bdi_writeback *__wb_iter_next(struct wb_iter *iter,
+						   struct backing_dev_info *bdi)
+{
+	struct radix_tree_iter *titer = &iter->tree_iter;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	if (iter->start_blkcg_id >= 0) {
+		iter->slot = radix_tree_iter_init(titer, iter->start_blkcg_id);
+		iter->start_blkcg_id = -1;
+	} else {
+		iter->slot = radix_tree_next_slot(iter->slot, titer, 0);
+	}
+
+	if (!iter->slot)
+		iter->slot = radix_tree_next_chunk(&bdi->cgwb_tree, titer, 0);
+	if (iter->slot)
+		return *iter->slot;
+	return NULL;
+}
+
+static inline struct bdi_writeback *__wb_iter_init(struct wb_iter *iter,
+						   struct backing_dev_info *bdi,
+						   int start_blkcg_id)
+{
+	iter->start_blkcg_id = start_blkcg_id;
+
+	if (start_blkcg_id)
+		return __wb_iter_next(iter, bdi);
+	else
+		return &bdi->wb;
+}
+
+/**
+ * bdi_for_each_wb - walk all wb's of a bdi in ascending blkcg ID order
+ * @wb_cur: cursor struct bdi_writeback pointer
+ * @bdi: bdi to walk wb's of
+ * @iter: pointer to struct wb_iter to be used as iteration buffer
+ * @start_blkcg_id: blkcg ID to start iteration from
+ *
+ * Iterate @wb_cur through the wb's (bdi_writeback's) of @bdi in ascending
+ * blkcg ID order starting from @start_blkcg_id.  @iter is struct wb_iter
+ * to be used as temp storage during iteration.  rcu_read_lock() must be
+ * held throughout iteration.
+ */
+#define bdi_for_each_wb(wb_cur, bdi, iter, start_blkcg_id)		\
+	for ((wb_cur) = __wb_iter_init(iter, bdi, start_blkcg_id);	\
+	     (wb_cur); (wb_cur) = __wb_iter_next(iter, bdi))
+
+#else	/* CONFIG_CGROUP_WRITEBACK */
+
+static inline bool inode_cgwb_enabled(struct inode *inode)
+{
+	return false;
+}
+
+static inline struct bdi_writeback_congested *
+wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp)
+{
+	return bdi->wb.congested;
+}
+
+static inline void wb_congested_put(struct bdi_writeback_congested *congested)
+{
+}
+
+static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi)
+{
+	return &bdi->wb;
+}
+
+static inline struct bdi_writeback *
+wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp)
+{
+	return &bdi->wb;
+}
+
+static inline bool inode_to_wb_is_valid(struct inode *inode)
+{
+	return true;
+}
+
+static inline struct bdi_writeback *inode_to_wb(struct inode *inode)
+{
+	return &inode_to_bdi(inode)->wb;
+}
+
+static inline struct bdi_writeback *
+unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp)
+{
+	return inode_to_wb(inode);
+}
+
+static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked)
+{
+}
+
+static inline void wb_memcg_offline(struct mem_cgroup *memcg)
+{
+}
+
+static inline void wb_blkcg_offline(struct blkcg *blkcg)
+{
+}
+
+struct wb_iter {
+	int		next_id;
+};
+
+#define bdi_for_each_wb(wb_cur, bdi, iter, start_blkcg_id)		\
+	for ((iter)->next_id = (start_blkcg_id);			\
+	     ({	(wb_cur) = !(iter)->next_id++ ? &(bdi)->wb : NULL; }); )
+
+static inline int inode_congested(struct inode *inode, int cong_bits)
+{
+	return wb_congested(&inode_to_bdi(inode)->wb, cong_bits);
+}
+
+#endif	/* CONFIG_CGROUP_WRITEBACK */
+
+static inline int inode_read_congested(struct inode *inode)
+{
+	return inode_congested(inode, 1 << WB_sync_congested);
+}
+
+static inline int inode_write_congested(struct inode *inode)
+{
+	return inode_congested(inode, 1 << WB_async_congested);
+}
+
+static inline int inode_rw_congested(struct inode *inode)
+{
+	return inode_congested(inode, (1 << WB_sync_congested) |
+				      (1 << WB_async_congested));
+}
+
+static inline int bdi_congested(struct backing_dev_info *bdi, int cong_bits)
+{
+	return wb_congested(&bdi->wb, cong_bits);
+}
+
+static inline int bdi_read_congested(struct backing_dev_info *bdi)
+{
+	return bdi_congested(bdi, 1 << WB_sync_congested);
+}
+
+static inline int bdi_write_congested(struct backing_dev_info *bdi)
+{
+	return bdi_congested(bdi, 1 << WB_async_congested);
+}
+
+static inline int bdi_rw_congested(struct backing_dev_info *bdi)
+{
+	return bdi_congested(bdi, (1 << WB_sync_congested) |
+				  (1 << WB_async_congested));
+}
+
+#endif	/* _LINUX_BACKING_DEV_H */
diff --git a/include/linux/bio.h b/include/linux/bio.h
index da3a127c99583b..5e963a6d7c1427 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -290,7 +290,21 @@ static inline unsigned bio_segments(struct bio *bio)
  * returns. and then bio would be freed memory when if (bio->bi_flags ...)
  * runs
  */
-#define bio_get(bio)	atomic_inc(&(bio)->bi_cnt)
+static inline void bio_get(struct bio *bio)
+{
+	bio->bi_flags |= (1 << BIO_REFFED);
+	smp_mb__before_atomic();
+	atomic_inc(&bio->__bi_cnt);
+}
+
+static inline void bio_cnt_set(struct bio *bio, unsigned int count)
+{
+	if (count != 1) {
+		bio->bi_flags |= (1 << BIO_REFFED);
+		smp_mb__before_atomic();
+	}
+	atomic_set(&bio->__bi_cnt, count);
+}
 
 enum bip_flags {
 	BIP_BLOCK_INTEGRITY	= 1 << 0, /* block layer owns integrity data */
@@ -413,7 +427,6 @@ static inline struct bio *bio_clone_kmalloc(struct bio *bio, gfp_t gfp_mask)
 }
 
 extern void bio_endio(struct bio *, int);
-extern void bio_endio_nodec(struct bio *, int);
 struct request_queue;
 extern int bio_phys_segments(struct request_queue *, struct bio *);
 
@@ -469,9 +482,12 @@ extern void bvec_free(mempool_t *, struct bio_vec *, unsigned int);
 extern unsigned int bvec_nr_vecs(unsigned short idx);
 
 #ifdef CONFIG_BLK_CGROUP
+int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css);
 int bio_associate_current(struct bio *bio);
 void bio_disassociate_task(struct bio *bio);
 #else	/* CONFIG_BLK_CGROUP */
+static inline int bio_associate_blkcg(struct bio *bio,
+			struct cgroup_subsys_state *blkcg_css) { return 0; }
 static inline int bio_associate_current(struct bio *bio) { return -ENOENT; }
 static inline void bio_disassociate_task(struct bio *bio) { }
 #endif	/* CONFIG_BLK_CGROUP */
diff --git a/block/blk-cgroup.h b/include/linux/blk-cgroup.h
index c567865b5f1df6..07a32b813ed897 100644
--- a/block/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -53,6 +53,10 @@ struct blkcg {
 	/* TODO: per-policy storage in blkcg */
 	unsigned int			cfq_weight;	/* belongs to cfq */
 	unsigned int			cfq_leaf_weight;
+
+#ifdef CONFIG_CGROUP_WRITEBACK
+	struct list_head		cgwb_list;
+#endif
 };
 
 struct blkg_stat {
@@ -95,6 +99,12 @@ struct blkcg_gq {
 	struct hlist_node		blkcg_node;
 	struct blkcg			*blkcg;
 
+	/*
+	 * Each blkg gets congested separately and the congestion state is
+	 * propagated to the matching bdi_writeback_congested.
+	 */
+	struct bdi_writeback_congested	*wb_congested;
+
 	/* all non-root blkcg_gq's are guaranteed to have access to parent */
 	struct blkcg_gq			*parent;
 
@@ -134,6 +144,7 @@ struct blkcg_policy {
 };
 
 extern struct blkcg blkcg_root;
+extern struct cgroup_subsys_state * const blkcg_root_css;
 
 struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q);
 struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
@@ -194,6 +205,12 @@ static inline struct blkcg *bio_blkcg(struct bio *bio)
 	return task_blkcg(current);
 }
 
+static inline struct cgroup_subsys_state *
+task_get_blkcg_css(struct task_struct *task)
+{
+	return task_get_css(task, blkio_cgrp_id);
+}
+
 /**
  * blkcg_parent - get the parent of a blkcg
  * @blkcg: blkcg of interest
@@ -558,8 +575,8 @@ static inline void blkg_rwstat_merge(struct blkg_rwstat *to,
 
 #else	/* CONFIG_BLK_CGROUP */
 
-struct cgroup;
-struct blkcg;
+struct blkcg {
+};
 
 struct blkg_policy_data {
 };
@@ -570,6 +587,16 @@ struct blkcg_gq {
 struct blkcg_policy {
 };
 
+#define blkcg_root_css	((struct cgroup_subsys_state *)ERR_PTR(-EINVAL))
+
+static inline struct cgroup_subsys_state *
+task_get_blkcg_css(struct task_struct *task)
+{
+	return NULL;
+}
+
+#ifdef CONFIG_BLOCK
+
 static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; }
 static inline int blkcg_init_queue(struct request_queue *q) { return 0; }
 static inline void blkcg_drain_queue(struct request_queue *q) { }
@@ -599,5 +626,6 @@ static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q
 #define blk_queue_for_each_rl(rl, q)	\
 	for ((rl) = &(q)->root_rl; (rl); (rl) = NULL)
 
+#endif	/* CONFIG_BLOCK */
 #endif	/* CONFIG_BLK_CGROUP */
 #endif	/* _BLK_CGROUP_H */
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 2056a99b92f8c9..37d1602c4f7aa0 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -96,6 +96,7 @@ typedef void (exit_request_fn)(void *, struct request *, unsigned int,
 
 typedef void (busy_iter_fn)(struct blk_mq_hw_ctx *, struct request *, void *,
 		bool);
+typedef void (busy_tag_iter_fn)(struct request *, void *, bool);
 
 struct blk_mq_ops {
 	/*
@@ -182,6 +183,7 @@ bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
 struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
 		gfp_t gfp, bool reserved);
 struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag);
+struct cpumask *blk_mq_tags_cpumask(struct blk_mq_tags *tags);
 
 enum {
 	BLK_MQ_UNIQUE_TAG_BITS = 16,
@@ -224,6 +226,8 @@ void blk_mq_run_hw_queues(struct request_queue *q, bool async);
 void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
 void blk_mq_tag_busy_iter(struct blk_mq_hw_ctx *hctx, busy_iter_fn *fn,
 		void *priv);
+void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
+		void *priv);
 void blk_mq_freeze_queue(struct request_queue *q);
 void blk_mq_unfreeze_queue(struct request_queue *q);
 void blk_mq_freeze_queue_start(struct request_queue *q);
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index b7299febc4b4ad..6ab9d12d1f1776 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -65,7 +65,7 @@ struct bio {
 	unsigned int		bi_seg_front_size;
 	unsigned int		bi_seg_back_size;
 
-	atomic_t		bi_remaining;
+	atomic_t		__bi_remaining;
 
 	bio_end_io_t		*bi_end_io;
 
@@ -92,7 +92,7 @@ struct bio {
 
 	unsigned short		bi_max_vecs;	/* max bvl_vecs we can hold */
 
-	atomic_t		bi_cnt;		/* pin count */
+	atomic_t		__bi_cnt;	/* pin count */
 
 	struct bio_vec		*bi_io_vec;	/* the actual vec list */
 
@@ -112,16 +112,15 @@ struct bio {
  * bio flags
  */
 #define BIO_UPTODATE	0	/* ok after I/O completion */
-#define BIO_RW_BLOCK	1	/* RW_AHEAD set, and read/write would block */
-#define BIO_EOF		2	/* out-out-bounds error */
-#define BIO_SEG_VALID	3	/* bi_phys_segments valid */
-#define BIO_CLONED	4	/* doesn't own data */
-#define BIO_BOUNCED	5	/* bio is a bounce bio */
-#define BIO_USER_MAPPED 6	/* contains user pages */
-#define BIO_EOPNOTSUPP	7	/* not supported */
-#define BIO_NULL_MAPPED 8	/* contains invalid user pages */
-#define BIO_QUIET	9	/* Make BIO Quiet */
-#define BIO_SNAP_STABLE	10	/* bio data must be snapshotted during write */
+#define BIO_SEG_VALID	1	/* bi_phys_segments valid */
+#define BIO_CLONED	2	/* doesn't own data */
+#define BIO_BOUNCED	3	/* bio is a bounce bio */
+#define BIO_USER_MAPPED 4	/* contains user pages */
+#define BIO_NULL_MAPPED 5	/* contains invalid user pages */
+#define BIO_QUIET	6	/* Make BIO Quiet */
+#define BIO_SNAP_STABLE	7	/* bio data must be snapshotted during write */
+#define BIO_CHAIN	8	/* chained bio, ->bi_remaining in effect */
+#define BIO_REFFED	9	/* bio has elevated ->bi_cnt */
 
 /*
  * Flags starting here get preserved by bio_reset() - this includes
@@ -193,6 +192,7 @@ enum rq_flag_bits {
 	__REQ_HASHED,		/* on IO scheduler merge hash */
 	__REQ_MQ_INFLIGHT,	/* track inflight for MQ */
 	__REQ_NO_TIMEOUT,	/* requests may never expire */
+	__REQ_CLONE,		/* cloned bios */
 	__REQ_NR_BITS,		/* stops here */
 };
 
@@ -247,5 +247,6 @@ enum rq_flag_bits {
 #define REQ_HASHED		(1ULL << __REQ_HASHED)
 #define REQ_MQ_INFLIGHT		(1ULL << __REQ_MQ_INFLIGHT)
 #define REQ_NO_TIMEOUT		(1ULL << __REQ_NO_TIMEOUT)
+#define REQ_CLONE		(1ULL << __REQ_CLONE)
 
 #endif /* __LINUX_BLK_TYPES_H */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 5d93a6645e8867..c9a2a0c722ca65 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -12,7 +12,7 @@
 #include <linux/timer.h>
 #include <linux/workqueue.h>
 #include <linux/pagemap.h>
-#include <linux/backing-dev.h>
+#include <linux/backing-dev-defs.h>
 #include <linux/wait.h>
 #include <linux/mempool.h>
 #include <linux/bio.h>
@@ -22,15 +22,13 @@
 #include <linux/smp.h>
 #include <linux/rcupdate.h>
 #include <linux/percpu-refcount.h>
-
-#include <asm/scatterlist.h>
+#include <linux/scatterlist.h>
 
 struct module;
 struct scsi_ioctl_command;
 
 struct request_queue;
 struct elevator_queue;
-struct request_pm_state;
 struct blk_trace;
 struct request;
 struct sg_io_hdr;
@@ -75,18 +73,7 @@ struct request_list {
 enum rq_cmd_type_bits {
 	REQ_TYPE_FS		= 1,	/* fs request */
 	REQ_TYPE_BLOCK_PC,		/* scsi command */
-	REQ_TYPE_SENSE,			/* sense request */
-	REQ_TYPE_PM_SUSPEND,		/* suspend request */
-	REQ_TYPE_PM_RESUME,		/* resume request */
-	REQ_TYPE_PM_SHUTDOWN,		/* shutdown request */
-	REQ_TYPE_SPECIAL,		/* driver defined type */
-	/*
-	 * for ATA/ATAPI devices. this really doesn't belong here, ide should
-	 * use REQ_TYPE_SPECIAL and use rq->cmd[0] with the range of driver
-	 * private REQ_LB opcodes to differentiate what type of request this is
-	 */
-	REQ_TYPE_ATA_TASKFILE,
-	REQ_TYPE_ATA_PC,
+	REQ_TYPE_DRV_PRIV,		/* driver defined types from here */
 };
 
 #define BLK_MAX_CDB	16
@@ -108,7 +95,7 @@ struct request {
 	struct blk_mq_ctx *mq_ctx;
 
 	u64 cmd_flags;
-	enum rq_cmd_type_bits cmd_type;
+	unsigned cmd_type;
 	unsigned long atomic_flags;
 
 	int cpu;
@@ -216,19 +203,6 @@ static inline unsigned short req_get_ioprio(struct request *req)
 	return req->ioprio;
 }
 
-/*
- * State information carried for REQ_TYPE_PM_SUSPEND and REQ_TYPE_PM_RESUME
- * requests. Some step values could eventually be made generic.
- */
-struct request_pm_state
-{
-	/* PM state machine step value, currently driver specific */
-	int	pm_step;
-	/* requested PM state value (S1, S2, S3, S4, ...) */
-	u32	pm_state;
-	void*	data;		/* for driver use */
-};
-
 #include <linux/elevator.h>
 
 struct blk_queue_ctx;
@@ -469,7 +443,7 @@ struct request_queue {
 	struct mutex		sysfs_lock;
 
 	int			bypass_depth;
-	int			mq_freeze_depth;
+	atomic_t		mq_freeze_depth;
 
 #if defined(CONFIG_BLK_DEV_BSG)
 	bsg_job_fn		*bsg_job_fn;
@@ -610,10 +584,6 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q)
 	(((rq)->cmd_flags & REQ_STARTED) && \
 	 ((rq)->cmd_type == REQ_TYPE_FS))
 
-#define blk_pm_request(rq)	\
-	((rq)->cmd_type == REQ_TYPE_PM_SUSPEND || \
-	 (rq)->cmd_type == REQ_TYPE_PM_RESUME)
-
 #define blk_rq_cpu_valid(rq)	((rq)->cpu != -1)
 #define blk_bidi_rq(rq)		((rq)->next_rq != NULL)
 /* rq->queuelist of dequeued request must be list_empty() */
@@ -804,11 +774,7 @@ extern void blk_add_request_payload(struct request *rq, struct page *page,
 		unsigned int len);
 extern int blk_rq_check_limits(struct request_queue *q, struct request *rq);
 extern int blk_lld_busy(struct request_queue *q);
-extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
-			     struct bio_set *bs, gfp_t gfp_mask,
-			     int (*bio_ctr)(struct bio *, struct bio *, void *),
-			     void *data);
-extern void blk_rq_unprep_clone(struct request *rq);
+extern void blk_rq_prep_clone(struct request *rq, struct request *rq_src);
 extern int blk_insert_cloned_request(struct request_queue *q,
 				     struct request *rq);
 extern void blk_delay_queue(struct request_queue *, unsigned long);
@@ -821,30 +787,12 @@ extern int scsi_cmd_ioctl(struct request_queue *, struct gendisk *, fmode_t,
 extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t,
 			 struct scsi_ioctl_command __user *);
 
-/*
- * A queue has just exitted congestion.  Note this in the global counter of
- * congested queues, and wake up anyone who was waiting for requests to be
- * put back.
- */
-static inline void blk_clear_queue_congested(struct request_queue *q, int sync)
-{
-	clear_bdi_congested(&q->backing_dev_info, sync);
-}
-
-/*
- * A queue has just entered congestion.  Flag that in the queue's VM-visible
- * state flags and increment the global gounter of congested queues.
- */
-static inline void blk_set_queue_congested(struct request_queue *q, int sync)
-{
-	set_bdi_congested(&q->backing_dev_info, sync);
-}
-
 extern void blk_start_queue(struct request_queue *q);
 extern void blk_stop_queue(struct request_queue *q);
 extern void blk_sync_queue(struct request_queue *q);
 extern void __blk_stop_queue(struct request_queue *q);
 extern void __blk_run_queue(struct request_queue *q);
+extern void __blk_run_queue_uncond(struct request_queue *q);
 extern void blk_run_queue(struct request_queue *);
 extern void blk_run_queue_async(struct request_queue *q);
 extern int blk_rq_map_user(struct request_queue *, struct request *,
@@ -933,7 +881,7 @@ static inline unsigned int blk_rq_get_max_sectors(struct request *rq)
 	if (unlikely(rq->cmd_type == REQ_TYPE_BLOCK_PC))
 		return q->limits.max_hw_sectors;
 
-	if (!q->limits.chunk_sectors)
+	if (!q->limits.chunk_sectors || (rq->cmd_flags & REQ_DISCARD))
 		return blk_queue_get_max_sectors(q, rq->cmd_flags);
 
 	return min(blk_max_size_offset(q, blk_rq_pos(rq)),
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index b9cb94c3102a40..e7da0aa65b2d7a 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -774,6 +774,31 @@ static inline struct cgroup_subsys_state *task_css(struct task_struct *task,
 }
 
 /**
+ * task_get_css - find and get the css for (task, subsys)
+ * @task: the target task
+ * @subsys_id: the target subsystem ID
+ *
+ * Find the css for the (@task, @subsys_id) combination, increment a
+ * reference on and return it.  This function is guaranteed to return a
+ * valid css.
+ */
+static inline struct cgroup_subsys_state *
+task_get_css(struct task_struct *task, int subsys_id)
+{
+	struct cgroup_subsys_state *css;
+
+	rcu_read_lock();
+	while (true) {
+		css = task_css(task, subsys_id);
+		if (likely(css_tryget_online(css)))
+			break;
+		cpu_relax();
+	}
+	rcu_read_unlock();
+	return css;
+}
+
+/**
  * task_css_is_root - test whether a task belongs to the root css
  * @task: the target task
  * @subsys_id: the target subsystem ID
diff --git a/include/linux/dmapool.h b/include/linux/dmapool.h
index 52456aa566a05e..e1043f79122f82 100644
--- a/include/linux/dmapool.h
+++ b/include/linux/dmapool.h
@@ -11,8 +11,8 @@
 #ifndef LINUX_DMAPOOL_H
 #define	LINUX_DMAPOOL_H
 
+#include <linux/scatterlist.h>
 #include <asm/io.h>
-#include <asm/scatterlist.h>
 
 struct device;
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c726591f2c5056..a14655434366ac 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -35,6 +35,7 @@
 #include <uapi/linux/fs.h>
 
 struct backing_dev_info;
+struct bdi_writeback;
 struct export_operations;
 struct hd_geometry;
 struct iovec;
@@ -635,6 +636,14 @@ struct inode {
 
 	struct hlist_node	i_hash;
 	struct list_head	i_wb_list;	/* backing dev IO list */
+#ifdef CONFIG_CGROUP_WRITEBACK
+	struct bdi_writeback	*i_wb;		/* the associated cgroup wb */
+
+	/* foreign inode detection, see wbc_detach_inode() */
+	int			i_wb_frn_winner;
+	u16			i_wb_frn_avg_time;
+	u16			i_wb_frn_history;
+#endif
 	struct list_head	i_lru;		/* inode LRU list */
 	struct list_head	i_sb_list;
 	union {
@@ -1807,6 +1816,11 @@ struct super_operations {
  *
  * I_DIO_WAKEUP		Never set.  Only used as a key for wait_on_bit().
  *
+ * I_WB_SWITCH		Cgroup bdi_writeback switching in progress.  Used to
+ *			synchronize competing switching instances and to tell
+ *			wb stat updates to grab mapping->tree_lock.  See
+ *			inode_switch_wb_work_fn() for details.
+ *
  * Q: What is the difference between I_WILL_FREE and I_FREEING?
  */
 #define I_DIRTY_SYNC		(1 << 0)
@@ -1826,6 +1840,7 @@ struct super_operations {
 #define I_DIRTY_TIME		(1 << 11)
 #define __I_DIRTY_TIME_EXPIRED	12
 #define I_DIRTY_TIME_EXPIRED	(1 << __I_DIRTY_TIME_EXPIRED)
+#define I_WB_SWITCH		(1 << 13)
 
 #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)
 #define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME)
@@ -1899,6 +1914,7 @@ struct file_system_type {
 #define FS_HAS_SUBTYPE		4
 #define FS_USERNS_MOUNT		8	/* Can be mounted by userns root */
 #define FS_USERNS_DEV_MOUNT	16 /* A userns mount does not imply MNT_NODEV */
+#define FS_CGROUP_WRITEBACK	32	/* Supports cgroup-aware writeback */
 #define FS_RENAME_DOES_D_MOVE	32768	/* FS will handle d_move() during rename() internally. */
 	struct dentry *(*mount) (struct file_system_type *, int,
 		       const char *, void *);
@@ -2242,7 +2258,13 @@ extern struct super_block *freeze_bdev(struct block_device *);
 extern void emergency_thaw_all(void);
 extern int thaw_bdev(struct block_device *bdev, struct super_block *sb);
 extern int fsync_bdev(struct block_device *);
-extern int sb_is_blkdev_sb(struct super_block *sb);
+
+extern struct super_block *blockdev_superblock;
+
+static inline bool sb_is_blkdev_sb(struct super_block *sb)
+{
+	return sb == blockdev_superblock;
+}
 #else
 static inline void bd_forget(struct inode *inode) {}
 static inline int sync_blockdev(struct block_device *bdev) { return 0; }
@@ -2281,6 +2303,9 @@ extern struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
 extern struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode,
 					      void *holder);
 extern void blkdev_put(struct block_device *bdev, fmode_t mode);
+extern int __blkdev_reread_part(struct block_device *bdev);
+extern int blkdev_reread_part(struct block_device *bdev);
+
 #ifdef CONFIG_SYSFS
 extern int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk);
 extern void bd_unlink_disk_holder(struct block_device *bdev,
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 93b5ca754b5b4c..a633898f36ac83 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -39,6 +39,19 @@
 
 struct device;
 
+/* IDE-specific values for req->cmd_type */
+enum ata_cmd_type_bits {
+	REQ_TYPE_ATA_TASKFILE = REQ_TYPE_DRV_PRIV + 1,
+	REQ_TYPE_ATA_PC,
+	REQ_TYPE_ATA_SENSE,	/* sense request */
+	REQ_TYPE_ATA_PM_SUSPEND,/* suspend request */
+	REQ_TYPE_ATA_PM_RESUME,	/* resume request */
+};
+
+#define ata_pm_request(rq)	\
+	((rq)->cmd_type == REQ_TYPE_ATA_PM_SUSPEND || \
+	 (rq)->cmd_type == REQ_TYPE_ATA_PM_RESUME)
+
 /* Error codes returned in rq->errors to the higher part of the driver. */
 enum {
 	IDE_DRV_ERROR_GENERAL	= 101,
@@ -1314,6 +1327,19 @@ struct ide_port_info {
 	u8			udma_mask;
 };
 
+/*
+ * State information carried for REQ_TYPE_ATA_PM_SUSPEND and REQ_TYPE_ATA_PM_RESUME
+ * requests.
+ */
+struct ide_pm_state {
+	/* PM state machine step value, currently driver specific */
+	int	pm_step;
+	/* requested PM state value (S1, S2, S3, S4, ...) */
+	u32	pm_state;
+	void*	data;		/* for driver use */
+};
+
+
 int ide_pci_init_one(struct pci_dev *, const struct ide_port_info *, void *);
 int ide_pci_init_two(struct pci_dev *, struct pci_dev *,
 		     const struct ide_port_info *, void *);
@@ -1551,4 +1577,5 @@ static inline void ide_set_drivedata(ide_drive_t *drive, void *data)
 #define ide_host_for_each_port(i, port, host) \
 	for ((i) = 0; ((port) = (host)->ports[i]) || (i) < MAX_HOST_PORTS; (i)++)
 
+
 #endif /* _IDE_H */
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 6c8918114804fd..73b02b0a8f609a 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -41,6 +41,7 @@ enum mem_cgroup_stat_index {
 	MEM_CGROUP_STAT_RSS,		/* # of pages charged as anon rss */
 	MEM_CGROUP_STAT_RSS_HUGE,	/* # of pages charged as anon huge */
 	MEM_CGROUP_STAT_FILE_MAPPED,	/* # of pages charged as file rss */
+	MEM_CGROUP_STAT_DIRTY,          /* # of dirty pages in page cache */
 	MEM_CGROUP_STAT_WRITEBACK,	/* # of pages under writeback */
 	MEM_CGROUP_STAT_SWAP,		/* # of pages, swapped out */
 	MEM_CGROUP_STAT_NSTATS,
@@ -67,6 +68,8 @@ enum mem_cgroup_events_index {
 };
 
 #ifdef CONFIG_MEMCG
+extern struct cgroup_subsys_state *mem_cgroup_root_css;
+
 void mem_cgroup_events(struct mem_cgroup *memcg,
 		       enum mem_cgroup_events_index idx,
 		       unsigned int nr);
@@ -112,6 +115,7 @@ static inline bool mm_match_cgroup(struct mm_struct *mm,
 }
 
 extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg);
+extern struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page);
 
 struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
 				   struct mem_cgroup *,
@@ -195,6 +199,8 @@ void mem_cgroup_split_huge_fixup(struct page *head);
 #else /* CONFIG_MEMCG */
 struct mem_cgroup;
 
+#define mem_cgroup_root_css ((struct cgroup_subsys_state *)ERR_PTR(-EINVAL))
+
 static inline void mem_cgroup_events(struct mem_cgroup *memcg,
 				     enum mem_cgroup_events_index idx,
 				     unsigned int nr)
@@ -382,6 +388,29 @@ enum {
 	OVER_LIMIT,
 };
 
+#ifdef CONFIG_CGROUP_WRITEBACK
+
+struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg);
+struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb);
+void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pavail,
+			 unsigned long *pdirty, unsigned long *pwriteback);
+
+#else	/* CONFIG_CGROUP_WRITEBACK */
+
+static inline struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
+{
+	return NULL;
+}
+
+static inline void mem_cgroup_wb_stats(struct bdi_writeback *wb,
+				       unsigned long *pavail,
+				       unsigned long *pdirty,
+				       unsigned long *pwriteback)
+{
+}
+
+#endif	/* CONFIG_CGROUP_WRITEBACK */
+
 struct sock;
 #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
 void sock_update_memcg(struct sock *sk);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0755b9fd03a7d9..4024543b4203eb 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -27,6 +27,7 @@ struct anon_vma_chain;
 struct file_ra_state;
 struct user_struct;
 struct writeback_control;
+struct bdi_writeback;
 
 #ifndef CONFIG_NEED_MULTIPLE_NODES	/* Don't use mapnrs, do it properly */
 extern unsigned long max_mapnr;
@@ -1211,10 +1212,13 @@ int __set_page_dirty_nobuffers(struct page *page);
 int __set_page_dirty_no_writeback(struct page *page);
 int redirty_page_for_writepage(struct writeback_control *wbc,
 				struct page *page);
-void account_page_dirtied(struct page *page, struct address_space *mapping);
-void account_page_cleaned(struct page *page, struct address_space *mapping);
+void account_page_dirtied(struct page *page, struct address_space *mapping,
+			  struct mem_cgroup *memcg);
+void account_page_cleaned(struct page *page, struct address_space *mapping,
+			  struct mem_cgroup *memcg, struct bdi_writeback *wb);
 int set_page_dirty(struct page *page);
 int set_page_dirty_lock(struct page *page);
+void cancel_dirty_page(struct page *page);
 int clear_page_dirty_for_io(struct page *page);
 
 int get_cmdline(struct task_struct *task, char *buffer, int buflen);
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 8dbd05e70f095b..986bf8ad8e93da 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -74,7 +74,7 @@ struct nvme_dev {
 	struct blk_mq_tag_set tagset;
 	struct blk_mq_tag_set admin_tagset;
 	u32 __iomem *dbs;
-	struct pci_dev *pci_dev;
+	struct device *dev;
 	struct dma_pool *prp_page_pool;
 	struct dma_pool *prp_small_pool;
 	int instance;
@@ -146,25 +146,15 @@ static inline u64 nvme_block_nr(struct nvme_ns *ns, sector_t sector)
 	return (sector >> (ns->lba_shift - 9));
 }
 
-/**
- * nvme_free_iod - frees an nvme_iod
- * @dev: The device that the I/O was submitted to
- * @iod: The memory to free
- */
-void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod);
-
-int nvme_setup_prps(struct nvme_dev *, struct nvme_iod *, int, gfp_t);
-struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write,
-				unsigned long addr, unsigned length);
-void nvme_unmap_user_pages(struct nvme_dev *dev, int write,
-			struct nvme_iod *iod);
-int nvme_submit_io_cmd(struct nvme_dev *, struct nvme_ns *,
-						struct nvme_command *, u32 *);
-int nvme_submit_flush_data(struct nvme_queue *nvmeq, struct nvme_ns *ns);
-int nvme_submit_admin_cmd(struct nvme_dev *, struct nvme_command *,
-							u32 *result);
-int nvme_identify(struct nvme_dev *, unsigned nsid, unsigned cns,
-							dma_addr_t dma_addr);
+int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
+		void *buf, unsigned bufflen);
+int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
+		void *buffer, void __user *ubuffer, unsigned bufflen,
+		u32 *result, unsigned timeout);
+int nvme_identify_ctrl(struct nvme_dev *dev, struct nvme_id_ctrl **id);
+int nvme_identify_ns(struct nvme_dev *dev, unsigned nsid,
+		struct nvme_id_ns **id);
+int nvme_get_log_page(struct nvme_dev *dev, struct nvme_smart_log **log);
 int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid,
 			dma_addr_t dma_addr, u32 *result);
 int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11,
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 4b3736f7065c49..fb0814ca65c732 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -651,7 +651,8 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
 int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
 				pgoff_t index, gfp_t gfp_mask);
 extern void delete_from_page_cache(struct page *page);
-extern void __delete_from_page_cache(struct page *page, void *shadow);
+extern void __delete_from_page_cache(struct page *page, void *shadow,
+				     struct mem_cgroup *memcg);
 int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask);
 
 /*
diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h
index a0edb992c9c31a..50a8486c524bb2 100644
--- a/include/linux/scatterlist.h
+++ b/include/linux/scatterlist.h
@@ -2,13 +2,39 @@
 #define _LINUX_SCATTERLIST_H
 
 #include <linux/string.h>
+#include <linux/types.h>
 #include <linux/bug.h>
 #include <linux/mm.h>
-
-#include <asm/types.h>
-#include <asm/scatterlist.h>
 #include <asm/io.h>
 
+struct scatterlist {
+#ifdef CONFIG_DEBUG_SG
+	unsigned long	sg_magic;
+#endif
+	unsigned long	page_link;
+	unsigned int	offset;
+	unsigned int	length;
+	dma_addr_t	dma_address;
+#ifdef CONFIG_NEED_SG_DMA_LENGTH
+	unsigned int	dma_length;
+#endif
+};
+
+/*
+ * These macros should be used after a dma_map_sg call has been done
+ * to get bus addresses of each of the SG entries and their lengths.
+ * You should only work with the number of sg entries dma_map_sg
+ * returns, or alternatively stop on the first sg_dma_len(sg) which
+ * is 0.
+ */
+#define sg_dma_address(sg)	((sg)->dma_address)
+
+#ifdef CONFIG_NEED_SG_DMA_LENGTH
+#define sg_dma_len(sg)		((sg)->dma_length)
+#else
+#define sg_dma_len(sg)		((sg)->length)
+#endif
+
 struct sg_table {
 	struct scatterlist *sgl;	/* the list */
 	unsigned int nents;		/* number of mapped entries */
@@ -18,10 +44,9 @@ struct sg_table {
 /*
  * Notes on SG table design.
  *
- * Architectures must provide an unsigned long page_link field in the
- * scatterlist struct. We use that to place the page pointer AND encode
- * information about the sg table as well. The two lower bits are reserved
- * for this information.
+ * We use the unsigned long page_link field in the scatterlist struct to place
+ * the page pointer AND encode information about the sg table as well. The two
+ * lower bits are reserved for this information.
  *
  * If bit 0 is set, then the page_link contains a pointer to the next sg
  * table list. Otherwise the next entry is at sg + 1.
diff --git a/include/linux/swap.h b/include/linux/swap.h
index cee108cbe2d52e..38874729dc5fd8 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -377,7 +377,6 @@ extern void end_swap_bio_write(struct bio *bio, int err);
 extern int __swap_writepage(struct page *page, struct writeback_control *wbc,
 	void (*end_write_func)(struct bio *, int));
 extern int swap_set_page_dirty(struct page *page);
-extern void end_swap_bio_read(struct bio *bio, int err);
 
 int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
 		unsigned long nr_pages, sector_t start_block);
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index b2dd371ec0ca0a..b333c945e57117 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -7,6 +7,8 @@
 #include <linux/sched.h>
 #include <linux/workqueue.h>
 #include <linux/fs.h>
+#include <linux/flex_proportions.h>
+#include <linux/backing-dev-defs.h>
 
 DECLARE_PER_CPU(int, dirty_throttle_leaks);
 
@@ -84,18 +86,95 @@ struct writeback_control {
 	unsigned for_reclaim:1;		/* Invoked from the page allocator */
 	unsigned range_cyclic:1;	/* range_start is cyclic */
 	unsigned for_sync:1;		/* sync(2) WB_SYNC_ALL writeback */
+#ifdef CONFIG_CGROUP_WRITEBACK
+	struct bdi_writeback *wb;	/* wb this writeback is issued under */
+	struct inode *inode;		/* inode being written out */
+
+	/* foreign inode detection, see wbc_detach_inode() */
+	int wb_id;			/* current wb id */
+	int wb_lcand_id;		/* last foreign candidate wb id */
+	int wb_tcand_id;		/* this foreign candidate wb id */
+	size_t wb_bytes;		/* bytes written by current wb */
+	size_t wb_lcand_bytes;		/* bytes written by last candidate */
+	size_t wb_tcand_bytes;		/* bytes written by this candidate */
+#endif
 };
 
 /*
+ * A wb_domain represents a domain that wb's (bdi_writeback's) belong to
+ * and are measured against each other in.  There always is one global
+ * domain, global_wb_domain, that every wb in the system is a member of.
+ * This allows measuring the relative bandwidth of each wb to distribute
+ * dirtyable memory accordingly.
+ */
+struct wb_domain {
+	spinlock_t lock;
+
+	/*
+	 * Scale the writeback cache size proportional to the relative
+	 * writeout speed.
+	 *
+	 * We do this by keeping a floating proportion between BDIs, based
+	 * on page writeback completions [end_page_writeback()]. Those
+	 * devices that write out pages fastest will get the larger share,
+	 * while the slower will get a smaller share.
+	 *
+	 * We use page writeout completions because we are interested in
+	 * getting rid of dirty pages. Having them written out is the
+	 * primary goal.
+	 *
+	 * We introduce a concept of time, a period over which we measure
+	 * these events, because demand can/will vary over time. The length
+	 * of this period itself is measured in page writeback completions.
+	 */
+	struct fprop_global completions;
+	struct timer_list period_timer;	/* timer for aging of completions */
+	unsigned long period_time;
+
+	/*
+	 * The dirtyable memory and dirty threshold could be suddenly
+	 * knocked down by a large amount (eg. on the startup of KVM in a
+	 * swapless system). This may throw the system into deep dirty
+	 * exceeded state and throttle heavy/light dirtiers alike. To
+	 * retain good responsiveness, maintain global_dirty_limit for
+	 * tracking slowly down to the knocked down dirty threshold.
+	 *
+	 * Both fields are protected by ->lock.
+	 */
+	unsigned long dirty_limit_tstamp;
+	unsigned long dirty_limit;
+};
+
+/**
+ * wb_domain_size_changed - memory available to a wb_domain has changed
+ * @dom: wb_domain of interest
+ *
+ * This function should be called when the amount of memory available to
+ * @dom has changed.  It resets @dom's dirty limit parameters to prevent
+ * the past values which don't match the current configuration from skewing
+ * dirty throttling.  Without this, when memory size of a wb_domain is
+ * greatly reduced, the dirty throttling logic may allow too many pages to
+ * be dirtied leading to consecutive unnecessary OOMs and may get stuck in
+ * that situation.
+ */
+static inline void wb_domain_size_changed(struct wb_domain *dom)
+{
+	spin_lock(&dom->lock);
+	dom->dirty_limit_tstamp = jiffies;
+	dom->dirty_limit = 0;
+	spin_unlock(&dom->lock);
+}
+
+/*
  * fs/fs-writeback.c
  */	
 struct bdi_writeback;
 void writeback_inodes_sb(struct super_block *, enum wb_reason reason);
 void writeback_inodes_sb_nr(struct super_block *, unsigned long nr,
 							enum wb_reason reason);
-int try_to_writeback_inodes_sb(struct super_block *, enum wb_reason reason);
-int try_to_writeback_inodes_sb_nr(struct super_block *, unsigned long nr,
-				  enum wb_reason reason);
+bool try_to_writeback_inodes_sb(struct super_block *, enum wb_reason reason);
+bool try_to_writeback_inodes_sb_nr(struct super_block *, unsigned long nr,
+				   enum wb_reason reason);
 void sync_inodes_sb(struct super_block *);
 void wakeup_flusher_threads(long nr_pages, enum wb_reason reason);
 void inode_wait_for_writeback(struct inode *inode);
@@ -107,6 +186,123 @@ static inline void wait_on_inode(struct inode *inode)
 	wait_on_bit(&inode->i_state, __I_NEW, TASK_UNINTERRUPTIBLE);
 }
 
+#ifdef CONFIG_CGROUP_WRITEBACK
+
+#include <linux/cgroup.h>
+#include <linux/bio.h>
+
+void __inode_attach_wb(struct inode *inode, struct page *page);
+void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
+				 struct inode *inode)
+	__releases(&inode->i_lock);
+void wbc_detach_inode(struct writeback_control *wbc);
+void wbc_account_io(struct writeback_control *wbc, struct page *page,
+		    size_t bytes);
+
+/**
+ * inode_attach_wb - associate an inode with its wb
+ * @inode: inode of interest
+ * @page: page being dirtied (may be NULL)
+ *
+ * If @inode doesn't have its wb, associate it with the wb matching the
+ * memcg of @page or, if @page is NULL, %current.  May be called w/ or w/o
+ * @inode->i_lock.
+ */
+static inline void inode_attach_wb(struct inode *inode, struct page *page)
+{
+	if (!inode->i_wb)
+		__inode_attach_wb(inode, page);
+}
+
+/**
+ * inode_detach_wb - disassociate an inode from its wb
+ * @inode: inode of interest
+ *
+ * @inode is being freed.  Detach from its wb.
+ */
+static inline void inode_detach_wb(struct inode *inode)
+{
+	if (inode->i_wb) {
+		wb_put(inode->i_wb);
+		inode->i_wb = NULL;
+	}
+}
+
+/**
+ * wbc_attach_fdatawrite_inode - associate wbc and inode for fdatawrite
+ * @wbc: writeback_control of interest
+ * @inode: target inode
+ *
+ * This function is to be used by __filemap_fdatawrite_range(), which is an
+ * alternative entry point into writeback code, and first ensures @inode is
+ * associated with a bdi_writeback and attaches it to @wbc.
+ */
+static inline void wbc_attach_fdatawrite_inode(struct writeback_control *wbc,
+					       struct inode *inode)
+{
+	spin_lock(&inode->i_lock);
+	inode_attach_wb(inode, NULL);
+	wbc_attach_and_unlock_inode(wbc, inode);
+}
+
+/**
+ * wbc_init_bio - writeback specific initializtion of bio
+ * @wbc: writeback_control for the writeback in progress
+ * @bio: bio to be initialized
+ *
+ * @bio is a part of the writeback in progress controlled by @wbc.  Perform
+ * writeback specific initialization.  This is used to apply the cgroup
+ * writeback context.
+ */
+static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio)
+{
+	/*
+	 * pageout() path doesn't attach @wbc to the inode being written
+	 * out.  This is intentional as we don't want the function to block
+	 * behind a slow cgroup.  Ultimately, we want pageout() to kick off
+	 * regular writeback instead of writing things out itself.
+	 */
+	if (wbc->wb)
+		bio_associate_blkcg(bio, wbc->wb->blkcg_css);
+}
+
+#else	/* CONFIG_CGROUP_WRITEBACK */
+
+static inline void inode_attach_wb(struct inode *inode, struct page *page)
+{
+}
+
+static inline void inode_detach_wb(struct inode *inode)
+{
+}
+
+static inline void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
+					       struct inode *inode)
+	__releases(&inode->i_lock)
+{
+	spin_unlock(&inode->i_lock);
+}
+
+static inline void wbc_attach_fdatawrite_inode(struct writeback_control *wbc,
+					       struct inode *inode)
+{
+}
+
+static inline void wbc_detach_inode(struct writeback_control *wbc)
+{
+}
+
+static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio)
+{
+}
+
+static inline void wbc_account_io(struct writeback_control *wbc,
+				  struct page *page, size_t bytes)
+{
+}
+
+#endif	/* CONFIG_CGROUP_WRITEBACK */
+
 /*
  * mm/page-writeback.c
  */
@@ -120,8 +316,12 @@ static inline void laptop_sync_completion(void) { }
 #endif
 void throttle_vm_writeout(gfp_t gfp_mask);
 bool zone_dirty_ok(struct zone *zone);
+int wb_domain_init(struct wb_domain *dom, gfp_t gfp);
+#ifdef CONFIG_CGROUP_WRITEBACK
+void wb_domain_exit(struct wb_domain *dom);
+#endif
 
-extern unsigned long global_dirty_limit;
+extern struct wb_domain global_wb_domain;
 
 /* These are exported to sysctl. */
 extern int dirty_background_ratio;
@@ -155,19 +355,12 @@ int dirty_writeback_centisecs_handler(struct ctl_table *, int,
 				      void __user *, size_t *, loff_t *);
 
 void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty);
-unsigned long bdi_dirty_limit(struct backing_dev_info *bdi,
-			       unsigned long dirty);
-
-void __bdi_update_bandwidth(struct backing_dev_info *bdi,
-			    unsigned long thresh,
-			    unsigned long bg_thresh,
-			    unsigned long dirty,
-			    unsigned long bdi_thresh,
-			    unsigned long bdi_dirty,
-			    unsigned long start_time);
+unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh);
 
+void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time);
 void page_writeback_init(void);
 void balance_dirty_pages_ratelimited(struct address_space *mapping);
+bool wb_over_bg_thresh(struct bdi_writeback *wb);
 
 typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc,
 				void *data);
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index c178d13d6f4c0c..a7aa607a4c55e5 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -360,7 +360,7 @@ TRACE_EVENT(global_dirty_state,
 		__entry->nr_written	= global_page_state(NR_WRITTEN);
 		__entry->background_thresh = background_thresh;
 		__entry->dirty_thresh	= dirty_thresh;
-		__entry->dirty_limit = global_dirty_limit;
+		__entry->dirty_limit	= global_wb_domain.dirty_limit;
 	),
 
 	TP_printk("dirty=%lu writeback=%lu unstable=%lu "
@@ -399,13 +399,13 @@ TRACE_EVENT(bdi_dirty_ratelimit,
 
 	TP_fast_assign(
 		strlcpy(__entry->bdi, dev_name(bdi->dev), 32);
-		__entry->write_bw	= KBps(bdi->write_bandwidth);
-		__entry->avg_write_bw	= KBps(bdi->avg_write_bandwidth);
+		__entry->write_bw	= KBps(bdi->wb.write_bandwidth);
+		__entry->avg_write_bw	= KBps(bdi->wb.avg_write_bandwidth);
 		__entry->dirty_rate	= KBps(dirty_rate);
-		__entry->dirty_ratelimit = KBps(bdi->dirty_ratelimit);
+		__entry->dirty_ratelimit = KBps(bdi->wb.dirty_ratelimit);
 		__entry->task_ratelimit	= KBps(task_ratelimit);
 		__entry->balanced_dirty_ratelimit =
-					  KBps(bdi->balanced_dirty_ratelimit);
+					KBps(bdi->wb.balanced_dirty_ratelimit);
 	),
 
 	TP_printk("bdi %s: "
@@ -462,8 +462,9 @@ TRACE_EVENT(balance_dirty_pages,
 		unsigned long freerun = (thresh + bg_thresh) / 2;
 		strlcpy(__entry->bdi, dev_name(bdi->dev), 32);
 
-		__entry->limit		= global_dirty_limit;
-		__entry->setpoint	= (global_dirty_limit + freerun) / 2;
+		__entry->limit		= global_wb_domain.dirty_limit;
+		__entry->setpoint	= (global_wb_domain.dirty_limit +
+						freerun) / 2;
 		__entry->dirty		= dirty;
 		__entry->bdi_setpoint	= __entry->setpoint *
 						bdi_thresh / (thresh + 1);
diff --git a/include/uapi/linux/nbd.h b/include/uapi/linux/nbd.h
index 4f52549b23ff87..e08e413d5f71c9 100644
--- a/include/uapi/linux/nbd.h
+++ b/include/uapi/linux/nbd.h
@@ -44,8 +44,6 @@ enum {
 /* there is a gap here to match userspace */
 #define NBD_FLAG_SEND_TRIM    (1 << 5) /* send trim/discard */
 
-#define nbd_cmd(req) ((req)->cmd[0])
-
 /* userspace doesn't need the nbd_device structure */
 
 /* These are sent over the network in the request/reply magic fields */
diff --git a/init/Kconfig b/init/Kconfig
index af48acda4915b9..f9cac0b037cc30 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1141,6 +1141,11 @@ config DEBUG_BLK_CGROUP
 	Enable some debugging help. Currently it exports additional stat
 	files in a cgroup which can be useful for debugging.
 
+config CGROUP_WRITEBACK
+	bool
+	depends on MEMCG && BLK_CGROUP
+	default y
+
 endif # CGROUPS
 
 config CHECKPOINT_RESTORE
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 29472bff11ef9b..cb880a14cc396e 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -7,8 +7,7 @@ obj-$(CONFIG_VT_CONSOLE_SLEEP)	+= console.o
 obj-$(CONFIG_FREEZER)		+= process.o
 obj-$(CONFIG_SUSPEND)		+= suspend.o
 obj-$(CONFIG_PM_TEST_SUSPEND)	+= suspend_test.o
-obj-$(CONFIG_HIBERNATION)	+= hibernate.o snapshot.o swap.o user.o \
-				   block_io.o
+obj-$(CONFIG_HIBERNATION)	+= hibernate.o snapshot.o swap.o user.o
 obj-$(CONFIG_PM_AUTOSLEEP)	+= autosleep.o
 obj-$(CONFIG_PM_WAKELOCKS)	+= wakelock.o
 
diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c
deleted file mode 100644
index 9a58bc25881059..00000000000000
--- a/kernel/power/block_io.c
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * This file provides functions for block I/O operations on swap/file.
- *
- * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz>
- * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
- *
- * This file is released under the GPLv2.
- */
-
-#include <linux/bio.h>
-#include <linux/kernel.h>
-#include <linux/pagemap.h>
-#include <linux/swap.h>
-
-#include "power.h"
-
-/**
- *	submit - submit BIO request.
- *	@rw:	READ or WRITE.
- *	@off	physical offset of page.
- *	@page:	page we're reading or writing.
- *	@bio_chain: list of pending biod (for async reading)
- *
- *	Straight from the textbook - allocate and initialize the bio.
- *	If we're reading, make sure the page is marked as dirty.
- *	Then submit it and, if @bio_chain == NULL, wait.
- */
-static int submit(int rw, struct block_device *bdev, sector_t sector,
-		struct page *page, struct bio **bio_chain)
-{
-	const int bio_rw = rw | REQ_SYNC;
-	struct bio *bio;
-
-	bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
-	bio->bi_iter.bi_sector = sector;
-	bio->bi_bdev = bdev;
-	bio->bi_end_io = end_swap_bio_read;
-
-	if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
-		printk(KERN_ERR "PM: Adding page to bio failed at %llu\n",
-			(unsigned long long)sector);
-		bio_put(bio);
-		return -EFAULT;
-	}
-
-	lock_page(page);
-	bio_get(bio);
-
-	if (bio_chain == NULL) {
-		submit_bio(bio_rw, bio);
-		wait_on_page_locked(page);
-		if (rw == READ)
-			bio_set_pages_dirty(bio);
-		bio_put(bio);
-	} else {
-		if (rw == READ)
-			get_page(page);	/* These pages are freed later */
-		bio->bi_private = *bio_chain;
-		*bio_chain = bio;
-		submit_bio(bio_rw, bio);
-	}
-	return 0;
-}
-
-int hib_bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
-{
-	return submit(READ, hib_resume_bdev, page_off * (PAGE_SIZE >> 9),
-			virt_to_page(addr), bio_chain);
-}
-
-int hib_bio_write_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
-{
-	return submit(WRITE, hib_resume_bdev, page_off * (PAGE_SIZE >> 9),
-			virt_to_page(addr), bio_chain);
-}
-
-int hib_wait_on_bio_chain(struct bio **bio_chain)
-{
-	struct bio *bio;
-	struct bio *next_bio;
-	int ret = 0;
-
-	if (bio_chain == NULL)
-		return 0;
-
-	bio = *bio_chain;
-	if (bio == NULL)
-		return 0;
-	while (bio) {
-		struct page *page;
-
-		next_bio = bio->bi_private;
-		page = bio->bi_io_vec[0].bv_page;
-		wait_on_page_locked(page);
-		if (!PageUptodate(page) || PageError(page))
-			ret = -EIO;
-		put_page(page);
-		bio_put(bio);
-		bio = next_bio;
-	}
-	*bio_chain = NULL;
-	return ret;
-}
diff --git a/kernel/power/power.h b/kernel/power/power.h
index ce9b8328a68911..caadb566e82bb5 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -163,15 +163,6 @@ extern void swsusp_close(fmode_t);
 extern int swsusp_unmark(void);
 #endif
 
-/* kernel/power/block_io.c */
-extern struct block_device *hib_resume_bdev;
-
-extern int hib_bio_read_page(pgoff_t page_off, void *addr,
-		struct bio **bio_chain);
-extern int hib_bio_write_page(pgoff_t page_off, void *addr,
-		struct bio **bio_chain);
-extern int hib_wait_on_bio_chain(struct bio **bio_chain);
-
 struct timeval;
 /* kernel/power/swsusp.c */
 extern void swsusp_show_speed(ktime_t, ktime_t, unsigned int, char *);
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 570aff817543fc..2f30ca91e4fadf 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -212,7 +212,84 @@ int swsusp_swap_in_use(void)
  */
 
 static unsigned short root_swap = 0xffff;
-struct block_device *hib_resume_bdev;
+static struct block_device *hib_resume_bdev;
+
+struct hib_bio_batch {
+	atomic_t		count;
+	wait_queue_head_t	wait;
+	int			error;
+};
+
+static void hib_init_batch(struct hib_bio_batch *hb)
+{
+	atomic_set(&hb->count, 0);
+	init_waitqueue_head(&hb->wait);
+	hb->error = 0;
+}
+
+static void hib_end_io(struct bio *bio, int error)
+{
+	struct hib_bio_batch *hb = bio->bi_private;
+	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+	struct page *page = bio->bi_io_vec[0].bv_page;
+
+	if (!uptodate || error) {
+		printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n",
+				imajor(bio->bi_bdev->bd_inode),
+				iminor(bio->bi_bdev->bd_inode),
+				(unsigned long long)bio->bi_iter.bi_sector);
+
+		if (!error)
+			error = -EIO;
+	}
+
+	if (bio_data_dir(bio) == WRITE)
+		put_page(page);
+
+	if (error && !hb->error)
+		hb->error = error;
+	if (atomic_dec_and_test(&hb->count))
+		wake_up(&hb->wait);
+
+	bio_put(bio);
+}
+
+static int hib_submit_io(int rw, pgoff_t page_off, void *addr,
+		struct hib_bio_batch *hb)
+{
+	struct page *page = virt_to_page(addr);
+	struct bio *bio;
+	int error = 0;
+
+	bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
+	bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9);
+	bio->bi_bdev = hib_resume_bdev;
+
+	if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
+		printk(KERN_ERR "PM: Adding page to bio failed at %llu\n",
+			(unsigned long long)bio->bi_iter.bi_sector);
+		bio_put(bio);
+		return -EFAULT;
+	}
+
+	if (hb) {
+		bio->bi_end_io = hib_end_io;
+		bio->bi_private = hb;
+		atomic_inc(&hb->count);
+		submit_bio(rw, bio);
+	} else {
+		error = submit_bio_wait(rw, bio);
+		bio_put(bio);
+	}
+
+	return error;
+}
+
+static int hib_wait_io(struct hib_bio_batch *hb)
+{
+	wait_event(hb->wait, atomic_read(&hb->count) == 0);
+	return hb->error;
+}
 
 /*
  * Saving part
@@ -222,7 +299,7 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
 {
 	int error;
 
-	hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL);
+	hib_submit_io(READ_SYNC, swsusp_resume_block, swsusp_header, NULL);
 	if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) ||
 	    !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) {
 		memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10);
@@ -231,7 +308,7 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
 		swsusp_header->flags = flags;
 		if (flags & SF_CRC32_MODE)
 			swsusp_header->crc32 = handle->crc32;
-		error = hib_bio_write_page(swsusp_resume_block,
+		error = hib_submit_io(WRITE_SYNC, swsusp_resume_block,
 					swsusp_header, NULL);
 	} else {
 		printk(KERN_ERR "PM: Swap header not found!\n");
@@ -271,10 +348,10 @@ static int swsusp_swap_check(void)
  *	write_page - Write one page to given swap location.
  *	@buf:		Address we're writing.
  *	@offset:	Offset of the swap page we're writing to.
- *	@bio_chain:	Link the next write BIO here
+ *	@hb:		bio completion batch
  */
 
-static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
+static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb)
 {
 	void *src;
 	int ret;
@@ -282,13 +359,13 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
 	if (!offset)
 		return -ENOSPC;
 
-	if (bio_chain) {
+	if (hb) {
 		src = (void *)__get_free_page(__GFP_WAIT | __GFP_NOWARN |
 		                              __GFP_NORETRY);
 		if (src) {
 			copy_page(src, buf);
 		} else {
-			ret = hib_wait_on_bio_chain(bio_chain); /* Free pages */
+			ret = hib_wait_io(hb); /* Free pages */
 			if (ret)
 				return ret;
 			src = (void *)__get_free_page(__GFP_WAIT |
@@ -298,14 +375,14 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
 				copy_page(src, buf);
 			} else {
 				WARN_ON_ONCE(1);
-				bio_chain = NULL;	/* Go synchronous */
+				hb = NULL;	/* Go synchronous */
 				src = buf;
 			}
 		}
 	} else {
 		src = buf;
 	}
-	return hib_bio_write_page(offset, src, bio_chain);
+	return hib_submit_io(WRITE_SYNC, offset, src, hb);
 }
 
 static void release_swap_writer(struct swap_map_handle *handle)
@@ -348,7 +425,7 @@ err_close:
 }
 
 static int swap_write_page(struct swap_map_handle *handle, void *buf,
-				struct bio **bio_chain)
+		struct hib_bio_batch *hb)
 {
 	int error = 0;
 	sector_t offset;
@@ -356,7 +433,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
 	if (!handle->cur)
 		return -EINVAL;
 	offset = alloc_swapdev_block(root_swap);
-	error = write_page(buf, offset, bio_chain);
+	error = write_page(buf, offset, hb);
 	if (error)
 		return error;
 	handle->cur->entries[handle->k++] = offset;
@@ -365,15 +442,15 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
 		if (!offset)
 			return -ENOSPC;
 		handle->cur->next_swap = offset;
-		error = write_page(handle->cur, handle->cur_swap, bio_chain);
+		error = write_page(handle->cur, handle->cur_swap, hb);
 		if (error)
 			goto out;
 		clear_page(handle->cur);
 		handle->cur_swap = offset;
 		handle->k = 0;
 
-		if (bio_chain && low_free_pages() <= handle->reqd_free_pages) {
-			error = hib_wait_on_bio_chain(bio_chain);
+		if (hb && low_free_pages() <= handle->reqd_free_pages) {
+			error = hib_wait_io(hb);
 			if (error)
 				goto out;
 			/*
@@ -445,23 +522,24 @@ static int save_image(struct swap_map_handle *handle,
 	int ret;
 	int nr_pages;
 	int err2;
-	struct bio *bio;
+	struct hib_bio_batch hb;
 	ktime_t start;
 	ktime_t stop;
 
+	hib_init_batch(&hb);
+
 	printk(KERN_INFO "PM: Saving image data pages (%u pages)...\n",
 		nr_to_write);
 	m = nr_to_write / 10;
 	if (!m)
 		m = 1;
 	nr_pages = 0;
-	bio = NULL;
 	start = ktime_get();
 	while (1) {
 		ret = snapshot_read_next(snapshot);
 		if (ret <= 0)
 			break;
-		ret = swap_write_page(handle, data_of(*snapshot), &bio);
+		ret = swap_write_page(handle, data_of(*snapshot), &hb);
 		if (ret)
 			break;
 		if (!(nr_pages % m))
@@ -469,7 +547,7 @@ static int save_image(struct swap_map_handle *handle,
 			       nr_pages / m * 10);
 		nr_pages++;
 	}
-	err2 = hib_wait_on_bio_chain(&bio);
+	err2 = hib_wait_io(&hb);
 	stop = ktime_get();
 	if (!ret)
 		ret = err2;
@@ -580,7 +658,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
 	int ret = 0;
 	int nr_pages;
 	int err2;
-	struct bio *bio;
+	struct hib_bio_batch hb;
 	ktime_t start;
 	ktime_t stop;
 	size_t off;
@@ -589,6 +667,8 @@ static int save_image_lzo(struct swap_map_handle *handle,
 	struct cmp_data *data = NULL;
 	struct crc_data *crc = NULL;
 
+	hib_init_batch(&hb);
+
 	/*
 	 * We'll limit the number of threads for compression to limit memory
 	 * footprint.
@@ -674,7 +754,6 @@ static int save_image_lzo(struct swap_map_handle *handle,
 	if (!m)
 		m = 1;
 	nr_pages = 0;
-	bio = NULL;
 	start = ktime_get();
 	for (;;) {
 		for (thr = 0; thr < nr_threads; thr++) {
@@ -748,7 +827,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
 			     off += PAGE_SIZE) {
 				memcpy(page, data[thr].cmp + off, PAGE_SIZE);
 
-				ret = swap_write_page(handle, page, &bio);
+				ret = swap_write_page(handle, page, &hb);
 				if (ret)
 					goto out_finish;
 			}
@@ -759,7 +838,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
 	}
 
 out_finish:
-	err2 = hib_wait_on_bio_chain(&bio);
+	err2 = hib_wait_io(&hb);
 	stop = ktime_get();
 	if (!ret)
 		ret = err2;
@@ -906,7 +985,7 @@ static int get_swap_reader(struct swap_map_handle *handle,
 			return -ENOMEM;
 		}
 
-		error = hib_bio_read_page(offset, tmp->map, NULL);
+		error = hib_submit_io(READ_SYNC, offset, tmp->map, NULL);
 		if (error) {
 			release_swap_reader(handle);
 			return error;
@@ -919,7 +998,7 @@ static int get_swap_reader(struct swap_map_handle *handle,
 }
 
 static int swap_read_page(struct swap_map_handle *handle, void *buf,
-				struct bio **bio_chain)
+		struct hib_bio_batch *hb)
 {
 	sector_t offset;
 	int error;
@@ -930,7 +1009,7 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf,
 	offset = handle->cur->entries[handle->k];
 	if (!offset)
 		return -EFAULT;
-	error = hib_bio_read_page(offset, buf, bio_chain);
+	error = hib_submit_io(READ_SYNC, offset, buf, hb);
 	if (error)
 		return error;
 	if (++handle->k >= MAP_PAGE_ENTRIES) {
@@ -968,27 +1047,28 @@ static int load_image(struct swap_map_handle *handle,
 	int ret = 0;
 	ktime_t start;
 	ktime_t stop;
-	struct bio *bio;
+	struct hib_bio_batch hb;
 	int err2;
 	unsigned nr_pages;
 
+	hib_init_batch(&hb);
+
 	printk(KERN_INFO "PM: Loading image data pages (%u pages)...\n",
 		nr_to_read);
 	m = nr_to_read / 10;
 	if (!m)
 		m = 1;
 	nr_pages = 0;
-	bio = NULL;
 	start = ktime_get();
 	for ( ; ; ) {
 		ret = snapshot_write_next(snapshot);
 		if (ret <= 0)
 			break;
-		ret = swap_read_page(handle, data_of(*snapshot), &bio);
+		ret = swap_read_page(handle, data_of(*snapshot), &hb);
 		if (ret)
 			break;
 		if (snapshot->sync_read)
-			ret = hib_wait_on_bio_chain(&bio);
+			ret = hib_wait_io(&hb);
 		if (ret)
 			break;
 		if (!(nr_pages % m))
@@ -996,7 +1076,7 @@ static int load_image(struct swap_map_handle *handle,
 			       nr_pages / m * 10);
 		nr_pages++;
 	}
-	err2 = hib_wait_on_bio_chain(&bio);
+	err2 = hib_wait_io(&hb);
 	stop = ktime_get();
 	if (!ret)
 		ret = err2;
@@ -1067,7 +1147,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
 	unsigned int m;
 	int ret = 0;
 	int eof = 0;
-	struct bio *bio;
+	struct hib_bio_batch hb;
 	ktime_t start;
 	ktime_t stop;
 	unsigned nr_pages;
@@ -1080,6 +1160,8 @@ static int load_image_lzo(struct swap_map_handle *handle,
 	struct dec_data *data = NULL;
 	struct crc_data *crc = NULL;
 
+	hib_init_batch(&hb);
+
 	/*
 	 * We'll limit the number of threads for decompression to limit memory
 	 * footprint.
@@ -1190,7 +1272,6 @@ static int load_image_lzo(struct swap_map_handle *handle,
 	if (!m)
 		m = 1;
 	nr_pages = 0;
-	bio = NULL;
 	start = ktime_get();
 
 	ret = snapshot_write_next(snapshot);
@@ -1199,7 +1280,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
 
 	for(;;) {
 		for (i = 0; !eof && i < want; i++) {
-			ret = swap_read_page(handle, page[ring], &bio);
+			ret = swap_read_page(handle, page[ring], &hb);
 			if (ret) {
 				/*
 				 * On real read error, finish. On end of data,
@@ -1226,7 +1307,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
 			if (!asked)
 				break;
 
-			ret = hib_wait_on_bio_chain(&bio);
+			ret = hib_wait_io(&hb);
 			if (ret)
 				goto out_finish;
 			have += asked;
@@ -1281,7 +1362,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
 		 * Wait for more data while we are decompressing.
 		 */
 		if (have < LZO_CMP_PAGES && asked) {
-			ret = hib_wait_on_bio_chain(&bio);
+			ret = hib_wait_io(&hb);
 			if (ret)
 				goto out_finish;
 			have += asked;
@@ -1430,7 +1511,7 @@ int swsusp_check(void)
 	if (!IS_ERR(hib_resume_bdev)) {
 		set_blocksize(hib_resume_bdev, PAGE_SIZE);
 		clear_page(swsusp_header);
-		error = hib_bio_read_page(swsusp_resume_block,
+		error = hib_submit_io(READ_SYNC, swsusp_resume_block,
 					swsusp_header, NULL);
 		if (error)
 			goto put;
@@ -1438,7 +1519,7 @@ int swsusp_check(void)
 		if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) {
 			memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10);
 			/* Reset swap signature now */
-			error = hib_bio_write_page(swsusp_resume_block,
+			error = hib_submit_io(WRITE_SYNC, swsusp_resume_block,
 						swsusp_header, NULL);
 		} else {
 			error = -EINVAL;
@@ -1482,10 +1563,10 @@ int swsusp_unmark(void)
 {
 	int error;
 
-	hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL);
+	hib_submit_io(READ_SYNC, swsusp_resume_block, swsusp_header, NULL);
 	if (!memcmp(HIBERNATE_SIG,swsusp_header->sig, 10)) {
 		memcpy(swsusp_header->sig,swsusp_header->orig_sig, 10);
-		error = hib_bio_write_page(swsusp_resume_block,
+		error = hib_submit_io(WRITE_SYNC, swsusp_resume_block,
 					swsusp_header, NULL);
 	} else {
 		printk(KERN_ERR "PM: Cannot find swsusp signature!\n");
diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index 4abda074ea4589..341268841b310b 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -29,10 +29,10 @@
 #include <linux/ctype.h>
 #include <linux/highmem.h>
 #include <linux/gfp.h>
+#include <linux/scatterlist.h>
 
 #include <asm/io.h>
 #include <asm/dma.h>
-#include <asm/scatterlist.h>
 
 #include <linux/init.h>
 #include <linux/bootmem.h>
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 000e7b3b9896f2..79b4309dfab092 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -18,6 +18,7 @@ struct backing_dev_info noop_backing_dev_info = {
 	.name		= "noop",
 	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
 };
+EXPORT_SYMBOL_GPL(noop_backing_dev_info);
 
 static struct class *bdi_class;
 
@@ -48,7 +49,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
 	struct bdi_writeback *wb = &bdi->wb;
 	unsigned long background_thresh;
 	unsigned long dirty_thresh;
-	unsigned long bdi_thresh;
+	unsigned long wb_thresh;
 	unsigned long nr_dirty, nr_io, nr_more_io, nr_dirty_time;
 	struct inode *inode;
 
@@ -66,7 +67,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
 	spin_unlock(&wb->list_lock);
 
 	global_dirty_limits(&background_thresh, &dirty_thresh);
-	bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
+	wb_thresh = wb_calc_thresh(wb, dirty_thresh);
 
 #define K(x) ((x) << (PAGE_SHIFT - 10))
 	seq_printf(m,
@@ -84,19 +85,19 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
 		   "b_dirty_time:       %10lu\n"
 		   "bdi_list:           %10u\n"
 		   "state:              %10lx\n",
-		   (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
-		   (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
-		   K(bdi_thresh),
+		   (unsigned long) K(wb_stat(wb, WB_WRITEBACK)),
+		   (unsigned long) K(wb_stat(wb, WB_RECLAIMABLE)),
+		   K(wb_thresh),
 		   K(dirty_thresh),
 		   K(background_thresh),
-		   (unsigned long) K(bdi_stat(bdi, BDI_DIRTIED)),
-		   (unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)),
-		   (unsigned long) K(bdi->write_bandwidth),
+		   (unsigned long) K(wb_stat(wb, WB_DIRTIED)),
+		   (unsigned long) K(wb_stat(wb, WB_WRITTEN)),
+		   (unsigned long) K(wb->write_bandwidth),
 		   nr_dirty,
 		   nr_io,
 		   nr_more_io,
 		   nr_dirty_time,
-		   !list_empty(&bdi->bdi_list), bdi->state);
+		   !list_empty(&bdi->bdi_list), bdi->wb.state);
 #undef K
 
 	return 0;
@@ -255,13 +256,8 @@ static int __init default_bdi_init(void)
 }
 subsys_initcall(default_bdi_init);
 
-int bdi_has_dirty_io(struct backing_dev_info *bdi)
-{
-	return wb_has_dirty_io(&bdi->wb);
-}
-
 /*
- * This function is used when the first inode for this bdi is marked dirty. It
+ * This function is used when the first inode for this wb is marked dirty. It
  * wakes-up the corresponding bdi thread which should then take care of the
  * periodic background write-out of dirty inodes. Since the write-out would
  * starts only 'dirty_writeback_interval' centisecs from now anyway, we just
@@ -274,162 +270,551 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi)
  * We have to be careful not to postpone flush work if it is scheduled for
  * earlier. Thus we use queue_delayed_work().
  */
-void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi)
+void wb_wakeup_delayed(struct bdi_writeback *wb)
 {
 	unsigned long timeout;
 
 	timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
-	spin_lock_bh(&bdi->wb_lock);
-	if (test_bit(BDI_registered, &bdi->state))
-		queue_delayed_work(bdi_wq, &bdi->wb.dwork, timeout);
-	spin_unlock_bh(&bdi->wb_lock);
+	spin_lock_bh(&wb->work_lock);
+	if (test_bit(WB_registered, &wb->state))
+		queue_delayed_work(bdi_wq, &wb->dwork, timeout);
+	spin_unlock_bh(&wb->work_lock);
 }
 
 /*
- * Remove bdi from bdi_list, and ensure that it is no longer visible
+ * Initial write bandwidth: 100 MB/s
  */
-static void bdi_remove_from_list(struct backing_dev_info *bdi)
-{
-	spin_lock_bh(&bdi_lock);
-	list_del_rcu(&bdi->bdi_list);
-	spin_unlock_bh(&bdi_lock);
-
-	synchronize_rcu_expedited();
-}
+#define INIT_BW		(100 << (20 - PAGE_SHIFT))
 
-int bdi_register(struct backing_dev_info *bdi, struct device *parent,
-		const char *fmt, ...)
+static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
+		   gfp_t gfp)
 {
-	va_list args;
-	struct device *dev;
+	int i, err;
 
-	if (bdi->dev)	/* The driver needs to use separate queues per device */
-		return 0;
+	memset(wb, 0, sizeof(*wb));
 
-	va_start(args, fmt);
-	dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args);
-	va_end(args);
-	if (IS_ERR(dev))
-		return PTR_ERR(dev);
+	wb->bdi = bdi;
+	wb->last_old_flush = jiffies;
+	INIT_LIST_HEAD(&wb->b_dirty);
+	INIT_LIST_HEAD(&wb->b_io);
+	INIT_LIST_HEAD(&wb->b_more_io);
+	INIT_LIST_HEAD(&wb->b_dirty_time);
+	spin_lock_init(&wb->list_lock);
 
-	bdi->dev = dev;
+	wb->bw_time_stamp = jiffies;
+	wb->balanced_dirty_ratelimit = INIT_BW;
+	wb->dirty_ratelimit = INIT_BW;
+	wb->write_bandwidth = INIT_BW;
+	wb->avg_write_bandwidth = INIT_BW;
 
-	bdi_debug_register(bdi, dev_name(dev));
-	set_bit(BDI_registered, &bdi->state);
+	spin_lock_init(&wb->work_lock);
+	INIT_LIST_HEAD(&wb->work_list);
+	INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
 
-	spin_lock_bh(&bdi_lock);
-	list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
-	spin_unlock_bh(&bdi_lock);
+	err = fprop_local_init_percpu(&wb->completions, gfp);
+	if (err)
+		return err;
 
-	trace_writeback_bdi_register(bdi);
-	return 0;
-}
-EXPORT_SYMBOL(bdi_register);
+	for (i = 0; i < NR_WB_STAT_ITEMS; i++) {
+		err = percpu_counter_init(&wb->stat[i], 0, gfp);
+		if (err) {
+			while (--i)
+				percpu_counter_destroy(&wb->stat[i]);
+			fprop_local_destroy_percpu(&wb->completions);
+			return err;
+		}
+	}
 
-int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
-{
-	return bdi_register(bdi, NULL, "%u:%u", MAJOR(dev), MINOR(dev));
+	return 0;
 }
-EXPORT_SYMBOL(bdi_register_dev);
 
 /*
  * Remove bdi from the global list and shutdown any threads we have running
  */
-static void bdi_wb_shutdown(struct backing_dev_info *bdi)
+static void wb_shutdown(struct bdi_writeback *wb)
 {
 	/* Make sure nobody queues further work */
-	spin_lock_bh(&bdi->wb_lock);
-	if (!test_and_clear_bit(BDI_registered, &bdi->state)) {
-		spin_unlock_bh(&bdi->wb_lock);
+	spin_lock_bh(&wb->work_lock);
+	if (!test_and_clear_bit(WB_registered, &wb->state)) {
+		spin_unlock_bh(&wb->work_lock);
 		return;
 	}
-	spin_unlock_bh(&bdi->wb_lock);
+	spin_unlock_bh(&wb->work_lock);
 
 	/*
-	 * Make sure nobody finds us on the bdi_list anymore
+	 * Drain work list and shutdown the delayed_work.  !WB_registered
+	 * tells wb_workfn() that @wb is dying and its work_list needs to
+	 * be drained no matter what.
 	 */
-	bdi_remove_from_list(bdi);
+	mod_delayed_work(bdi_wq, &wb->dwork, 0);
+	flush_delayed_work(&wb->dwork);
+	WARN_ON(!list_empty(&wb->work_list));
+}
+
+static void wb_exit(struct bdi_writeback *wb)
+{
+	int i;
+
+	WARN_ON(delayed_work_pending(&wb->dwork));
+
+	for (i = 0; i < NR_WB_STAT_ITEMS; i++)
+		percpu_counter_destroy(&wb->stat[i]);
+
+	fprop_local_destroy_percpu(&wb->completions);
+}
+
+#ifdef CONFIG_CGROUP_WRITEBACK
+
+#include <linux/memcontrol.h>
+
+/*
+ * cgwb_lock protects bdi->cgwb_tree, bdi->cgwb_congested_tree,
+ * blkcg->cgwb_list, and memcg->cgwb_list.  bdi->cgwb_tree is also RCU
+ * protected.  cgwb_release_wait is used to wait for the completion of cgwb
+ * releases from bdi destruction path.
+ */
+static DEFINE_SPINLOCK(cgwb_lock);
+static DECLARE_WAIT_QUEUE_HEAD(cgwb_release_wait);
+
+/**
+ * wb_congested_get_create - get or create a wb_congested
+ * @bdi: associated bdi
+ * @blkcg_id: ID of the associated blkcg
+ * @gfp: allocation mask
+ *
+ * Look up the wb_congested for @blkcg_id on @bdi.  If missing, create one.
+ * The returned wb_congested has its reference count incremented.  Returns
+ * NULL on failure.
+ */
+struct bdi_writeback_congested *
+wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp)
+{
+	struct bdi_writeback_congested *new_congested = NULL, *congested;
+	struct rb_node **node, *parent;
+	unsigned long flags;
+
+	if (blkcg_id == 1)
+		return &bdi->wb_congested;
+retry:
+	spin_lock_irqsave(&cgwb_lock, flags);
+
+	node = &bdi->cgwb_congested_tree.rb_node;
+	parent = NULL;
+
+	while (*node != NULL) {
+		parent = *node;
+		congested = container_of(parent, struct bdi_writeback_congested,
+					 rb_node);
+		if (congested->blkcg_id < blkcg_id)
+			node = &parent->rb_left;
+		else if (congested->blkcg_id > blkcg_id)
+			node = &parent->rb_right;
+		else
+			goto found;
+	}
+
+	if (new_congested) {
+		/* !found and storage for new one already allocated, insert */
+		congested = new_congested;
+		new_congested = NULL;
+		rb_link_node(&congested->rb_node, parent, node);
+		rb_insert_color(&congested->rb_node, &bdi->cgwb_congested_tree);
+		atomic_inc(&bdi->usage_cnt);
+		goto found;
+	}
+
+	spin_unlock_irqrestore(&cgwb_lock, flags);
+
+	/* allocate storage for new one and retry */
+	new_congested = kzalloc(sizeof(*new_congested), gfp);
+	if (!new_congested)
+		return NULL;
+
+	atomic_set(&new_congested->refcnt, 0);
+	new_congested->bdi = bdi;
+	new_congested->blkcg_id = blkcg_id;
+	goto retry;
+
+found:
+	atomic_inc(&congested->refcnt);
+	spin_unlock_irqrestore(&cgwb_lock, flags);
+	kfree(new_congested);
+	return congested;
+}
+
+/**
+ * wb_congested_put - put a wb_congested
+ * @congested: wb_congested to put
+ *
+ * Put @congested and destroy it if the refcnt reaches zero.
+ */
+void wb_congested_put(struct bdi_writeback_congested *congested)
+{
+	struct backing_dev_info *bdi = congested->bdi;
+	unsigned long flags;
+
+	if (congested->blkcg_id == 1)
+		return;
+
+	local_irq_save(flags);
+	if (!atomic_dec_and_lock(&congested->refcnt, &cgwb_lock)) {
+		local_irq_restore(flags);
+		return;
+	}
+
+	rb_erase(&congested->rb_node, &congested->bdi->cgwb_congested_tree);
+	spin_unlock_irqrestore(&cgwb_lock, flags);
+	kfree(congested);
+
+	if (atomic_dec_and_test(&bdi->usage_cnt))
+		wake_up_all(&cgwb_release_wait);
+}
+
+static void cgwb_release_workfn(struct work_struct *work)
+{
+	struct bdi_writeback *wb = container_of(work, struct bdi_writeback,
+						release_work);
+	struct backing_dev_info *bdi = wb->bdi;
+
+	wb_shutdown(wb);
+
+	css_put(wb->memcg_css);
+	css_put(wb->blkcg_css);
+	wb_congested_put(wb->congested);
+
+	fprop_local_destroy_percpu(&wb->memcg_completions);
+	percpu_ref_exit(&wb->refcnt);
+	wb_exit(wb);
+	kfree_rcu(wb, rcu);
+
+	if (atomic_dec_and_test(&bdi->usage_cnt))
+		wake_up_all(&cgwb_release_wait);
+}
+
+static void cgwb_release(struct percpu_ref *refcnt)
+{
+	struct bdi_writeback *wb = container_of(refcnt, struct bdi_writeback,
+						refcnt);
+	schedule_work(&wb->release_work);
+}
+
+static void cgwb_kill(struct bdi_writeback *wb)
+{
+	lockdep_assert_held(&cgwb_lock);
+
+	WARN_ON(!radix_tree_delete(&wb->bdi->cgwb_tree, wb->memcg_css->id));
+	list_del(&wb->memcg_node);
+	list_del(&wb->blkcg_node);
+	percpu_ref_kill(&wb->refcnt);
+}
+
+static int cgwb_create(struct backing_dev_info *bdi,
+		       struct cgroup_subsys_state *memcg_css, gfp_t gfp)
+{
+	struct mem_cgroup *memcg;
+	struct cgroup_subsys_state *blkcg_css;
+	struct blkcg *blkcg;
+	struct list_head *memcg_cgwb_list, *blkcg_cgwb_list;
+	struct bdi_writeback *wb;
+	unsigned long flags;
+	int ret = 0;
+
+	memcg = mem_cgroup_from_css(memcg_css);
+	blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &blkio_cgrp_subsys);
+	blkcg = css_to_blkcg(blkcg_css);
+	memcg_cgwb_list = mem_cgroup_cgwb_list(memcg);
+	blkcg_cgwb_list = &blkcg->cgwb_list;
+
+	/* look up again under lock and discard on blkcg mismatch */
+	spin_lock_irqsave(&cgwb_lock, flags);
+	wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
+	if (wb && wb->blkcg_css != blkcg_css) {
+		cgwb_kill(wb);
+		wb = NULL;
+	}
+	spin_unlock_irqrestore(&cgwb_lock, flags);
+	if (wb)
+		goto out_put;
+
+	/* need to create a new one */
+	wb = kmalloc(sizeof(*wb), gfp);
+	if (!wb)
+		return -ENOMEM;
+
+	ret = wb_init(wb, bdi, gfp);
+	if (ret)
+		goto err_free;
+
+	ret = percpu_ref_init(&wb->refcnt, cgwb_release, 0, gfp);
+	if (ret)
+		goto err_wb_exit;
+
+	ret = fprop_local_init_percpu(&wb->memcg_completions, gfp);
+	if (ret)
+		goto err_ref_exit;
+
+	wb->congested = wb_congested_get_create(bdi, blkcg_css->id, gfp);
+	if (!wb->congested) {
+		ret = -ENOMEM;
+		goto err_fprop_exit;
+	}
+
+	wb->memcg_css = memcg_css;
+	wb->blkcg_css = blkcg_css;
+	INIT_WORK(&wb->release_work, cgwb_release_workfn);
+	set_bit(WB_registered, &wb->state);
 
 	/*
-	 * Drain work list and shutdown the delayed_work.  At this point,
-	 * @bdi->bdi_list is empty telling bdi_Writeback_workfn() that @bdi
-	 * is dying and its work_list needs to be drained no matter what.
+	 * The root wb determines the registered state of the whole bdi and
+	 * memcg_cgwb_list and blkcg_cgwb_list's next pointers indicate
+	 * whether they're still online.  Don't link @wb if any is dead.
+	 * See wb_memcg_offline() and wb_blkcg_offline().
 	 */
-	mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
-	flush_delayed_work(&bdi->wb.dwork);
+	ret = -ENODEV;
+	spin_lock_irqsave(&cgwb_lock, flags);
+	if (test_bit(WB_registered, &bdi->wb.state) &&
+	    blkcg_cgwb_list->next && memcg_cgwb_list->next) {
+		/* we might have raced another instance of this function */
+		ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb);
+		if (!ret) {
+			atomic_inc(&bdi->usage_cnt);
+			list_add(&wb->memcg_node, memcg_cgwb_list);
+			list_add(&wb->blkcg_node, blkcg_cgwb_list);
+			css_get(memcg_css);
+			css_get(blkcg_css);
+		}
+	}
+	spin_unlock_irqrestore(&cgwb_lock, flags);
+	if (ret) {
+		if (ret == -EEXIST)
+			ret = 0;
+		goto err_put_congested;
+	}
+	goto out_put;
+
+err_put_congested:
+	wb_congested_put(wb->congested);
+err_fprop_exit:
+	fprop_local_destroy_percpu(&wb->memcg_completions);
+err_ref_exit:
+	percpu_ref_exit(&wb->refcnt);
+err_wb_exit:
+	wb_exit(wb);
+err_free:
+	kfree(wb);
+out_put:
+	css_put(blkcg_css);
+	return ret;
 }
 
-static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
+/**
+ * wb_get_create - get wb for a given memcg, create if necessary
+ * @bdi: target bdi
+ * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref)
+ * @gfp: allocation mask to use
+ *
+ * Try to get the wb for @memcg_css on @bdi.  If it doesn't exist, try to
+ * create one.  The returned wb has its refcount incremented.
+ *
+ * This function uses css_get() on @memcg_css and thus expects its refcnt
+ * to be positive on invocation.  IOW, rcu_read_lock() protection on
+ * @memcg_css isn't enough.  try_get it before calling this function.
+ *
+ * A wb is keyed by its associated memcg.  As blkcg implicitly enables
+ * memcg on the default hierarchy, memcg association is guaranteed to be
+ * more specific (equal or descendant to the associated blkcg) and thus can
+ * identify both the memcg and blkcg associations.
+ *
+ * Because the blkcg associated with a memcg may change as blkcg is enabled
+ * and disabled closer to root in the hierarchy, each wb keeps track of
+ * both the memcg and blkcg associated with it and verifies the blkcg on
+ * each lookup.  On mismatch, the existing wb is discarded and a new one is
+ * created.
+ */
+struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
+				    struct cgroup_subsys_state *memcg_css,
+				    gfp_t gfp)
 {
-	memset(wb, 0, sizeof(*wb));
+	struct bdi_writeback *wb;
+
+	might_sleep_if(gfp & __GFP_WAIT);
+
+	if (!memcg_css->parent)
+		return &bdi->wb;
+
+	do {
+		rcu_read_lock();
+		wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
+		if (wb) {
+			struct cgroup_subsys_state *blkcg_css;
+
+			/* see whether the blkcg association has changed */
+			blkcg_css = cgroup_get_e_css(memcg_css->cgroup,
+						     &blkio_cgrp_subsys);
+			if (unlikely(wb->blkcg_css != blkcg_css ||
+				     !wb_tryget(wb)))
+				wb = NULL;
+			css_put(blkcg_css);
+		}
+		rcu_read_unlock();
+	} while (!wb && !cgwb_create(bdi, memcg_css, gfp));
+
+	return wb;
+}
 
-	wb->bdi = bdi;
-	wb->last_old_flush = jiffies;
-	INIT_LIST_HEAD(&wb->b_dirty);
-	INIT_LIST_HEAD(&wb->b_io);
-	INIT_LIST_HEAD(&wb->b_more_io);
-	INIT_LIST_HEAD(&wb->b_dirty_time);
-	spin_lock_init(&wb->list_lock);
-	INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn);
+static void cgwb_bdi_init(struct backing_dev_info *bdi)
+{
+	bdi->wb.memcg_css = mem_cgroup_root_css;
+	bdi->wb.blkcg_css = blkcg_root_css;
+	bdi->wb_congested.blkcg_id = 1;
+	INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC);
+	bdi->cgwb_congested_tree = RB_ROOT;
+	atomic_set(&bdi->usage_cnt, 1);
 }
 
-/*
- * Initial write bandwidth: 100 MB/s
+static void cgwb_bdi_destroy(struct backing_dev_info *bdi)
+{
+	struct radix_tree_iter iter;
+	void **slot;
+
+	WARN_ON(test_bit(WB_registered, &bdi->wb.state));
+
+	spin_lock_irq(&cgwb_lock);
+	radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0)
+		cgwb_kill(*slot);
+	spin_unlock_irq(&cgwb_lock);
+
+	/*
+	 * All cgwb's and their congested states must be shutdown and
+	 * released before returning.  Drain the usage counter to wait for
+	 * all cgwb's and cgwb_congested's ever created on @bdi.
+	 */
+	atomic_dec(&bdi->usage_cnt);
+	wait_event(cgwb_release_wait, !atomic_read(&bdi->usage_cnt));
+}
+
+/**
+ * wb_memcg_offline - kill all wb's associated with a memcg being offlined
+ * @memcg: memcg being offlined
+ *
+ * Also prevents creation of any new wb's associated with @memcg.
  */
-#define INIT_BW		(100 << (20 - PAGE_SHIFT))
+void wb_memcg_offline(struct mem_cgroup *memcg)
+{
+	LIST_HEAD(to_destroy);
+	struct list_head *memcg_cgwb_list = mem_cgroup_cgwb_list(memcg);
+	struct bdi_writeback *wb, *next;
+
+	spin_lock_irq(&cgwb_lock);
+	list_for_each_entry_safe(wb, next, memcg_cgwb_list, memcg_node)
+		cgwb_kill(wb);
+	memcg_cgwb_list->next = NULL;	/* prevent new wb's */
+	spin_unlock_irq(&cgwb_lock);
+}
+
+/**
+ * wb_blkcg_offline - kill all wb's associated with a blkcg being offlined
+ * @blkcg: blkcg being offlined
+ *
+ * Also prevents creation of any new wb's associated with @blkcg.
+ */
+void wb_blkcg_offline(struct blkcg *blkcg)
+{
+	LIST_HEAD(to_destroy);
+	struct bdi_writeback *wb, *next;
+
+	spin_lock_irq(&cgwb_lock);
+	list_for_each_entry_safe(wb, next, &blkcg->cgwb_list, blkcg_node)
+		cgwb_kill(wb);
+	blkcg->cgwb_list.next = NULL;	/* prevent new wb's */
+	spin_unlock_irq(&cgwb_lock);
+}
+
+#else	/* CONFIG_CGROUP_WRITEBACK */
+
+static void cgwb_bdi_init(struct backing_dev_info *bdi) { }
+static void cgwb_bdi_destroy(struct backing_dev_info *bdi) { }
+
+#endif	/* CONFIG_CGROUP_WRITEBACK */
 
 int bdi_init(struct backing_dev_info *bdi)
 {
-	int i, err;
+	int err;
 
 	bdi->dev = NULL;
 
 	bdi->min_ratio = 0;
 	bdi->max_ratio = 100;
 	bdi->max_prop_frac = FPROP_FRAC_BASE;
-	spin_lock_init(&bdi->wb_lock);
 	INIT_LIST_HEAD(&bdi->bdi_list);
-	INIT_LIST_HEAD(&bdi->work_list);
+	init_waitqueue_head(&bdi->wb_waitq);
 
-	bdi_wb_init(&bdi->wb, bdi);
+	err = wb_init(&bdi->wb, bdi, GFP_KERNEL);
+	if (err)
+		return err;
 
-	for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
-		err = percpu_counter_init(&bdi->bdi_stat[i], 0, GFP_KERNEL);
-		if (err)
-			goto err;
-	}
+	bdi->wb_congested.state = 0;
+	bdi->wb.congested = &bdi->wb_congested;
 
-	bdi->dirty_exceeded = 0;
+	cgwb_bdi_init(bdi);
+	return 0;
+}
+EXPORT_SYMBOL(bdi_init);
 
-	bdi->bw_time_stamp = jiffies;
-	bdi->written_stamp = 0;
+int bdi_register(struct backing_dev_info *bdi, struct device *parent,
+		const char *fmt, ...)
+{
+	va_list args;
+	struct device *dev;
 
-	bdi->balanced_dirty_ratelimit = INIT_BW;
-	bdi->dirty_ratelimit = INIT_BW;
-	bdi->write_bandwidth = INIT_BW;
-	bdi->avg_write_bandwidth = INIT_BW;
+	if (bdi->dev)	/* The driver needs to use separate queues per device */
+		return 0;
 
-	err = fprop_local_init_percpu(&bdi->completions, GFP_KERNEL);
+	va_start(args, fmt);
+	dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args);
+	va_end(args);
+	if (IS_ERR(dev))
+		return PTR_ERR(dev);
 
-	if (err) {
-err:
-		while (i--)
-			percpu_counter_destroy(&bdi->bdi_stat[i]);
-	}
+	bdi->dev = dev;
 
-	return err;
+	bdi_debug_register(bdi, dev_name(dev));
+	set_bit(WB_registered, &bdi->wb.state);
+
+	spin_lock_bh(&bdi_lock);
+	list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
+	spin_unlock_bh(&bdi_lock);
+
+	trace_writeback_bdi_register(bdi);
+	return 0;
 }
-EXPORT_SYMBOL(bdi_init);
+EXPORT_SYMBOL(bdi_register);
 
-void bdi_destroy(struct backing_dev_info *bdi)
+int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
 {
-	int i;
+	return bdi_register(bdi, NULL, "%u:%u", MAJOR(dev), MINOR(dev));
+}
+EXPORT_SYMBOL(bdi_register_dev);
 
-	bdi_wb_shutdown(bdi);
-	bdi_set_min_ratio(bdi, 0);
+/*
+ * Remove bdi from bdi_list, and ensure that it is no longer visible
+ */
+static void bdi_remove_from_list(struct backing_dev_info *bdi)
+{
+	spin_lock_bh(&bdi_lock);
+	list_del_rcu(&bdi->bdi_list);
+	spin_unlock_bh(&bdi_lock);
 
-	WARN_ON(!list_empty(&bdi->work_list));
-	WARN_ON(delayed_work_pending(&bdi->wb.dwork));
+	synchronize_rcu_expedited();
+}
+
+void bdi_destroy(struct backing_dev_info *bdi)
+{
+	/* make sure nobody finds us on the bdi_list anymore */
+	bdi_remove_from_list(bdi);
+	wb_shutdown(&bdi->wb);
+	bdi_set_min_ratio(bdi, 0);
+	cgwb_bdi_destroy(bdi);
 
 	if (bdi->dev) {
 		bdi_debug_unregister(bdi);
@@ -437,9 +822,7 @@ void bdi_destroy(struct backing_dev_info *bdi)
 		bdi->dev = NULL;
 	}
 
-	for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
-		percpu_counter_destroy(&bdi->bdi_stat[i]);
-	fprop_local_destroy_percpu(&bdi->completions);
+	wb_exit(&bdi->wb);
 }
 EXPORT_SYMBOL(bdi_destroy);
 
@@ -472,31 +855,31 @@ static wait_queue_head_t congestion_wqh[2] = {
 		__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
 		__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
 	};
-static atomic_t nr_bdi_congested[2];
+static atomic_t nr_wb_congested[2];
 
-void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
+void clear_wb_congested(struct bdi_writeback_congested *congested, int sync)
 {
-	enum bdi_state bit;
 	wait_queue_head_t *wqh = &congestion_wqh[sync];
+	enum wb_state bit;
 
-	bit = sync ? BDI_sync_congested : BDI_async_congested;
-	if (test_and_clear_bit(bit, &bdi->state))
-		atomic_dec(&nr_bdi_congested[sync]);
+	bit = sync ? WB_sync_congested : WB_async_congested;
+	if (test_and_clear_bit(bit, &congested->state))
+		atomic_dec(&nr_wb_congested[sync]);
 	smp_mb__after_atomic();
 	if (waitqueue_active(wqh))
 		wake_up(wqh);
 }
-EXPORT_SYMBOL(clear_bdi_congested);
+EXPORT_SYMBOL(clear_wb_congested);
 
-void set_bdi_congested(struct backing_dev_info *bdi, int sync)
+void set_wb_congested(struct bdi_writeback_congested *congested, int sync)
 {
-	enum bdi_state bit;
+	enum wb_state bit;
 
-	bit = sync ? BDI_sync_congested : BDI_async_congested;
-	if (!test_and_set_bit(bit, &bdi->state))
-		atomic_inc(&nr_bdi_congested[sync]);
+	bit = sync ? WB_sync_congested : WB_async_congested;
+	if (!test_and_set_bit(bit, &congested->state))
+		atomic_inc(&nr_wb_congested[sync]);
 }
-EXPORT_SYMBOL(set_bdi_congested);
+EXPORT_SYMBOL(set_wb_congested);
 
 /**
  * congestion_wait - wait for a backing_dev to become uncongested
@@ -555,7 +938,7 @@ long wait_iff_congested(struct zone *zone, int sync, long timeout)
 	 * encountered in the current zone, yield if necessary instead
 	 * of sleeping on the congestion queue
 	 */
-	if (atomic_read(&nr_bdi_congested[sync]) == 0 ||
+	if (atomic_read(&nr_wb_congested[sync]) == 0 ||
 	    !test_bit(ZONE_CONGESTED, &zone->flags)) {
 		cond_resched();
 
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 4a3907cf79f89a..b8a5bc66b0c09b 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -115,7 +115,7 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
 	case POSIX_FADV_NOREUSE:
 		break;
 	case POSIX_FADV_DONTNEED:
-		if (!bdi_write_congested(bdi))
+		if (!inode_write_congested(mapping->host))
 			__filemap_fdatawrite_range(mapping, offset, endbyte,
 						   WB_SYNC_NONE);
 
diff --git a/mm/filemap.c b/mm/filemap.c
index 6bf5e42d560a46..bfc1ab053b1224 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -100,6 +100,7 @@
  *    ->tree_lock		(page_remove_rmap->set_page_dirty)
  *    bdi.wb->list_lock		(page_remove_rmap->set_page_dirty)
  *    ->inode->i_lock		(page_remove_rmap->set_page_dirty)
+ *    ->memcg->move_lock	(page_remove_rmap->mem_cgroup_begin_page_stat)
  *    bdi.wb->list_lock		(zap_pte_range->set_page_dirty)
  *    ->inode->i_lock		(zap_pte_range->set_page_dirty)
  *    ->private_lock		(zap_pte_range->__set_page_dirty_buffers)
@@ -174,9 +175,11 @@ static void page_cache_tree_delete(struct address_space *mapping,
 /*
  * Delete a page from the page cache and free it. Caller has to make
  * sure the page is locked and that nobody else uses it - or that usage
- * is safe.  The caller must hold the mapping's tree_lock.
+ * is safe.  The caller must hold the mapping's tree_lock and
+ * mem_cgroup_begin_page_stat().
  */
-void __delete_from_page_cache(struct page *page, void *shadow)
+void __delete_from_page_cache(struct page *page, void *shadow,
+			      struct mem_cgroup *memcg)
 {
 	struct address_space *mapping = page->mapping;
 
@@ -210,7 +213,8 @@ void __delete_from_page_cache(struct page *page, void *shadow)
 	 * anyway will be cleared before returning page into buddy allocator.
 	 */
 	if (WARN_ON_ONCE(PageDirty(page)))
-		account_page_cleaned(page, mapping);
+		account_page_cleaned(page, mapping, memcg,
+				     inode_to_wb(mapping->host));
 }
 
 /**
@@ -224,14 +228,20 @@ void __delete_from_page_cache(struct page *page, void *shadow)
 void delete_from_page_cache(struct page *page)
 {
 	struct address_space *mapping = page->mapping;
+	struct mem_cgroup *memcg;
+	unsigned long flags;
+
 	void (*freepage)(struct page *);
 
 	BUG_ON(!PageLocked(page));
 
 	freepage = mapping->a_ops->freepage;
-	spin_lock_irq(&mapping->tree_lock);
-	__delete_from_page_cache(page, NULL);
-	spin_unlock_irq(&mapping->tree_lock);
+
+	memcg = mem_cgroup_begin_page_stat(page);
+	spin_lock_irqsave(&mapping->tree_lock, flags);
+	__delete_from_page_cache(page, NULL, memcg);
+	spin_unlock_irqrestore(&mapping->tree_lock, flags);
+	mem_cgroup_end_page_stat(memcg);
 
 	if (freepage)
 		freepage(page);
@@ -281,7 +291,9 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
 	if (!mapping_cap_writeback_dirty(mapping))
 		return 0;
 
+	wbc_attach_fdatawrite_inode(&wbc, mapping->host);
 	ret = do_writepages(mapping, &wbc);
+	wbc_detach_inode(&wbc);
 	return ret;
 }
 
@@ -470,6 +482,8 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
 	if (!error) {
 		struct address_space *mapping = old->mapping;
 		void (*freepage)(struct page *);
+		struct mem_cgroup *memcg;
+		unsigned long flags;
 
 		pgoff_t offset = old->index;
 		freepage = mapping->a_ops->freepage;
@@ -478,15 +492,17 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
 		new->mapping = mapping;
 		new->index = offset;
 
-		spin_lock_irq(&mapping->tree_lock);
-		__delete_from_page_cache(old, NULL);
+		memcg = mem_cgroup_begin_page_stat(old);
+		spin_lock_irqsave(&mapping->tree_lock, flags);
+		__delete_from_page_cache(old, NULL, memcg);
 		error = radix_tree_insert(&mapping->page_tree, offset, new);
 		BUG_ON(error);
 		mapping->nrpages++;
 		__inc_zone_page_state(new, NR_FILE_PAGES);
 		if (PageSwapBacked(new))
 			__inc_zone_page_state(new, NR_SHMEM);
-		spin_unlock_irq(&mapping->tree_lock);
+		spin_unlock_irqrestore(&mapping->tree_lock, flags);
+		mem_cgroup_end_page_stat(memcg);
 		mem_cgroup_migrate(old, new, true);
 		radix_tree_preload_end();
 		if (freepage)
diff --git a/mm/madvise.c b/mm/madvise.c
index d551475517bfd8..64bb8a22110c23 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -17,6 +17,7 @@
 #include <linux/fs.h>
 #include <linux/file.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 14c2f2017e37cc..f816d91c643b7e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -77,6 +77,7 @@ EXPORT_SYMBOL(memory_cgrp_subsys);
 
 #define MEM_CGROUP_RECLAIM_RETRIES	5
 static struct mem_cgroup *root_mem_cgroup __read_mostly;
+struct cgroup_subsys_state *mem_cgroup_root_css __read_mostly;
 
 /* Whether the swap controller is active */
 #ifdef CONFIG_MEMCG_SWAP
@@ -90,6 +91,7 @@ static const char * const mem_cgroup_stat_names[] = {
 	"rss",
 	"rss_huge",
 	"mapped_file",
+	"dirty",
 	"writeback",
 	"swap",
 };
@@ -322,11 +324,6 @@ struct mem_cgroup {
 	 * percpu counter.
 	 */
 	struct mem_cgroup_stat_cpu __percpu *stat;
-	/*
-	 * used when a cpu is offlined or other synchronizations
-	 * See mem_cgroup_read_stat().
-	 */
-	struct mem_cgroup_stat_cpu nocpu_base;
 	spinlock_t pcp_counter_lock;
 
 #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
@@ -346,6 +343,11 @@ struct mem_cgroup {
 	atomic_t	numainfo_updating;
 #endif
 
+#ifdef CONFIG_CGROUP_WRITEBACK
+	struct list_head cgwb_list;
+	struct wb_domain cgwb_domain;
+#endif
+
 	/* List of events which userspace want to receive */
 	struct list_head event_list;
 	spinlock_t event_list_lock;
@@ -596,6 +598,39 @@ struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
 	return &memcg->css;
 }
 
+/**
+ * mem_cgroup_css_from_page - css of the memcg associated with a page
+ * @page: page of interest
+ *
+ * If memcg is bound to the default hierarchy, css of the memcg associated
+ * with @page is returned.  The returned css remains associated with @page
+ * until it is released.
+ *
+ * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
+ * is returned.
+ *
+ * XXX: The above description of behavior on the default hierarchy isn't
+ * strictly true yet as replace_page_cache_page() can modify the
+ * association before @page is released even on the default hierarchy;
+ * however, the current and planned usages don't mix the the two functions
+ * and replace_page_cache_page() will soon be updated to make the invariant
+ * actually true.
+ */
+struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
+{
+	struct mem_cgroup *memcg;
+
+	rcu_read_lock();
+
+	memcg = page->mem_cgroup;
+
+	if (!memcg || !cgroup_on_dfl(memcg->css.cgroup))
+		memcg = root_mem_cgroup;
+
+	rcu_read_unlock();
+	return &memcg->css;
+}
+
 static struct mem_cgroup_per_zone *
 mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page)
 {
@@ -795,15 +830,8 @@ static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
 	long val = 0;
 	int cpu;
 
-	get_online_cpus();
-	for_each_online_cpu(cpu)
+	for_each_possible_cpu(cpu)
 		val += per_cpu(memcg->stat->count[idx], cpu);
-#ifdef CONFIG_HOTPLUG_CPU
-	spin_lock(&memcg->pcp_counter_lock);
-	val += memcg->nocpu_base.count[idx];
-	spin_unlock(&memcg->pcp_counter_lock);
-#endif
-	put_online_cpus();
 	return val;
 }
 
@@ -813,15 +841,8 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
 	unsigned long val = 0;
 	int cpu;
 
-	get_online_cpus();
-	for_each_online_cpu(cpu)
+	for_each_possible_cpu(cpu)
 		val += per_cpu(memcg->stat->events[idx], cpu);
-#ifdef CONFIG_HOTPLUG_CPU
-	spin_lock(&memcg->pcp_counter_lock);
-	val += memcg->nocpu_base.events[idx];
-	spin_unlock(&memcg->pcp_counter_lock);
-#endif
-	put_online_cpus();
 	return val;
 }
 
@@ -2011,6 +2032,7 @@ again:
 
 	return memcg;
 }
+EXPORT_SYMBOL(mem_cgroup_begin_page_stat);
 
 /**
  * mem_cgroup_end_page_stat - finish a page state statistics transaction
@@ -2029,6 +2051,7 @@ void mem_cgroup_end_page_stat(struct mem_cgroup *memcg)
 
 	rcu_read_unlock();
 }
+EXPORT_SYMBOL(mem_cgroup_end_page_stat);
 
 /**
  * mem_cgroup_update_page_stat - update page state statistics
@@ -2169,37 +2192,12 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
 	mutex_unlock(&percpu_charge_mutex);
 }
 
-/*
- * This function drains percpu counter value from DEAD cpu and
- * move it to local cpu. Note that this function can be preempted.
- */
-static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
-{
-	int i;
-
-	spin_lock(&memcg->pcp_counter_lock);
-	for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
-		long x = per_cpu(memcg->stat->count[i], cpu);
-
-		per_cpu(memcg->stat->count[i], cpu) = 0;
-		memcg->nocpu_base.count[i] += x;
-	}
-	for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
-		unsigned long x = per_cpu(memcg->stat->events[i], cpu);
-
-		per_cpu(memcg->stat->events[i], cpu) = 0;
-		memcg->nocpu_base.events[i] += x;
-	}
-	spin_unlock(&memcg->pcp_counter_lock);
-}
-
 static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
 					unsigned long action,
 					void *hcpu)
 {
 	int cpu = (unsigned long)hcpu;
 	struct memcg_stock_pcp *stock;
-	struct mem_cgroup *iter;
 
 	if (action == CPU_ONLINE)
 		return NOTIFY_OK;
@@ -2207,9 +2205,6 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
 	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
 		return NOTIFY_OK;
 
-	for_each_mem_cgroup(iter)
-		mem_cgroup_drain_pcp_counter(iter, cpu);
-
 	stock = &per_cpu(memcg_stock, cpu);
 	drain_stock(stock);
 	return NOTIFY_OK;
@@ -3993,6 +3988,98 @@ static void memcg_destroy_kmem(struct mem_cgroup *memcg)
 }
 #endif
 
+#ifdef CONFIG_CGROUP_WRITEBACK
+
+struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg)
+{
+	return &memcg->cgwb_list;
+}
+
+static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
+{
+	return wb_domain_init(&memcg->cgwb_domain, gfp);
+}
+
+static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
+{
+	wb_domain_exit(&memcg->cgwb_domain);
+}
+
+static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
+{
+	wb_domain_size_changed(&memcg->cgwb_domain);
+}
+
+struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
+
+	if (!memcg->css.parent)
+		return NULL;
+
+	return &memcg->cgwb_domain;
+}
+
+/**
+ * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
+ * @wb: bdi_writeback in question
+ * @pavail: out parameter for number of available pages
+ * @pdirty: out parameter for number of dirty pages
+ * @pwriteback: out parameter for number of pages under writeback
+ *
+ * Determine the numbers of available, dirty, and writeback pages in @wb's
+ * memcg.  Dirty and writeback are self-explanatory.  Available is a bit
+ * more involved.
+ *
+ * A memcg's headroom is "min(max, high) - used".  The available memory is
+ * calculated as the lowest headroom of itself and the ancestors plus the
+ * number of pages already being used for file pages.  Note that this
+ * doesn't consider the actual amount of available memory in the system.
+ * The caller should further cap *@pavail accordingly.
+ */
+void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pavail,
+			 unsigned long *pdirty, unsigned long *pwriteback)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
+	struct mem_cgroup *parent;
+	unsigned long head_room = PAGE_COUNTER_MAX;
+	unsigned long file_pages;
+
+	*pdirty = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_DIRTY);
+
+	/* this should eventually include NR_UNSTABLE_NFS */
+	*pwriteback = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
+
+	file_pages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
+						    (1 << LRU_ACTIVE_FILE));
+	while ((parent = parent_mem_cgroup(memcg))) {
+		unsigned long ceiling = min(memcg->memory.limit, memcg->high);
+		unsigned long used = page_counter_read(&memcg->memory);
+
+		head_room = min(head_room, ceiling - min(ceiling, used));
+		memcg = parent;
+	}
+
+	*pavail = file_pages + head_room;
+}
+
+#else	/* CONFIG_CGROUP_WRITEBACK */
+
+static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
+{
+	return 0;
+}
+
+static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
+{
+}
+
+static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
+{
+}
+
+#endif	/* CONFIG_CGROUP_WRITEBACK */
+
 /*
  * DO NOT USE IN NEW FILES.
  *
@@ -4377,9 +4464,15 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
 	memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
 	if (!memcg->stat)
 		goto out_free;
+
+	if (memcg_wb_domain_init(memcg, GFP_KERNEL))
+		goto out_free_stat;
+
 	spin_lock_init(&memcg->pcp_counter_lock);
 	return memcg;
 
+out_free_stat:
+	free_percpu(memcg->stat);
 out_free:
 	kfree(memcg);
 	return NULL;
@@ -4406,6 +4499,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
 		free_mem_cgroup_per_zone_info(memcg, node);
 
 	free_percpu(memcg->stat);
+	memcg_wb_domain_exit(memcg);
 	kfree(memcg);
 }
 
@@ -4438,6 +4532,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 	/* root ? */
 	if (parent_css == NULL) {
 		root_mem_cgroup = memcg;
+		mem_cgroup_root_css = &memcg->css;
 		page_counter_init(&memcg->memory, NULL);
 		memcg->high = PAGE_COUNTER_MAX;
 		memcg->soft_limit = PAGE_COUNTER_MAX;
@@ -4456,7 +4551,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 #ifdef CONFIG_MEMCG_KMEM
 	memcg->kmemcg_id = -1;
 #endif
-
+#ifdef CONFIG_CGROUP_WRITEBACK
+	INIT_LIST_HEAD(&memcg->cgwb_list);
+#endif
 	return &memcg->css;
 
 free_out:
@@ -4544,6 +4641,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 	vmpressure_cleanup(&memcg->vmpressure);
 
 	memcg_deactivate_kmem(memcg);
+
+	wb_memcg_offline(memcg);
 }
 
 static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
@@ -4577,6 +4676,7 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
 	memcg->low = 0;
 	memcg->high = PAGE_COUNTER_MAX;
 	memcg->soft_limit = PAGE_COUNTER_MAX;
+	memcg_wb_domain_size_changed(memcg);
 }
 
 #ifdef CONFIG_MMU
@@ -4746,6 +4846,7 @@ static int mem_cgroup_move_account(struct page *page,
 {
 	unsigned long flags;
 	int ret;
+	bool anon;
 
 	VM_BUG_ON(from == to);
 	VM_BUG_ON_PAGE(PageLRU(page), page);
@@ -4771,15 +4872,33 @@ static int mem_cgroup_move_account(struct page *page,
 	if (page->mem_cgroup != from)
 		goto out_unlock;
 
+	anon = PageAnon(page);
+
 	spin_lock_irqsave(&from->move_lock, flags);
 
-	if (!PageAnon(page) && page_mapped(page)) {
+	if (!anon && page_mapped(page)) {
 		__this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
 			       nr_pages);
 		__this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
 			       nr_pages);
 	}
 
+	/*
+	 * move_lock grabbed above and caller set from->moving_account, so
+	 * mem_cgroup_update_page_stat() will serialize updates to PageDirty.
+	 * So mapping should be stable for dirty pages.
+	 */
+	if (!anon && PageDirty(page)) {
+		struct address_space *mapping = page_mapping(page);
+
+		if (mapping_cap_account_dirty(mapping)) {
+			__this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_DIRTY],
+				       nr_pages);
+			__this_cpu_add(to->stat->count[MEM_CGROUP_STAT_DIRTY],
+				       nr_pages);
+		}
+	}
+
 	if (PageWriteback(page)) {
 		__this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK],
 			       nr_pages);
@@ -5295,6 +5414,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
 
 	memcg->high = high;
 
+	memcg_wb_domain_size_changed(memcg);
 	return nbytes;
 }
 
@@ -5327,6 +5447,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
 	if (err)
 		return err;
 
+	memcg_wb_domain_size_changed(memcg);
 	return nbytes;
 }
 
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index eb59f7eea50827..22cddd3e5de843 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -122,31 +122,31 @@ EXPORT_SYMBOL(laptop_mode);
 
 /* End of sysctl-exported parameters */
 
-unsigned long global_dirty_limit;
+struct wb_domain global_wb_domain;
 
-/*
- * Scale the writeback cache size proportional to the relative writeout speeds.
- *
- * We do this by keeping a floating proportion between BDIs, based on page
- * writeback completions [end_page_writeback()]. Those devices that write out
- * pages fastest will get the larger share, while the slower will get a smaller
- * share.
- *
- * We use page writeout completions because we are interested in getting rid of
- * dirty pages. Having them written out is the primary goal.
- *
- * We introduce a concept of time, a period over which we measure these events,
- * because demand can/will vary over time. The length of this period itself is
- * measured in page writeback completions.
- *
- */
-static struct fprop_global writeout_completions;
+/* consolidated parameters for balance_dirty_pages() and its subroutines */
+struct dirty_throttle_control {
+#ifdef CONFIG_CGROUP_WRITEBACK
+	struct wb_domain	*dom;
+	struct dirty_throttle_control *gdtc;	/* only set in memcg dtc's */
+#endif
+	struct bdi_writeback	*wb;
+	struct fprop_local_percpu *wb_completions;
 
-static void writeout_period(unsigned long t);
-/* Timer for aging of writeout_completions */
-static struct timer_list writeout_period_timer =
-		TIMER_DEFERRED_INITIALIZER(writeout_period, 0, 0);
-static unsigned long writeout_period_time = 0;
+	unsigned long		avail;		/* dirtyable */
+	unsigned long		dirty;		/* file_dirty + write + nfs */
+	unsigned long		thresh;		/* dirty threshold */
+	unsigned long		bg_thresh;	/* dirty background threshold */
+
+	unsigned long		wb_dirty;	/* per-wb counterparts */
+	unsigned long		wb_thresh;
+	unsigned long		wb_bg_thresh;
+
+	unsigned long		pos_ratio;
+};
+
+#define DTC_INIT_COMMON(__wb)	.wb = (__wb),				\
+				.wb_completions = &(__wb)->completions
 
 /*
  * Length of period for aging writeout fractions of bdis. This is an
@@ -155,6 +155,97 @@ static unsigned long writeout_period_time = 0;
  */
 #define VM_COMPLETIONS_PERIOD_LEN (3*HZ)
 
+#ifdef CONFIG_CGROUP_WRITEBACK
+
+#define GDTC_INIT(__wb)		.dom = &global_wb_domain,		\
+				DTC_INIT_COMMON(__wb)
+#define GDTC_INIT_NO_WB		.dom = &global_wb_domain
+#define MDTC_INIT(__wb, __gdtc)	.dom = mem_cgroup_wb_domain(__wb),	\
+				.gdtc = __gdtc,				\
+				DTC_INIT_COMMON(__wb)
+
+static bool mdtc_valid(struct dirty_throttle_control *dtc)
+{
+	return dtc->dom;
+}
+
+static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc)
+{
+	return dtc->dom;
+}
+
+static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc)
+{
+	return mdtc->gdtc;
+}
+
+static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb)
+{
+	return &wb->memcg_completions;
+}
+
+static void wb_min_max_ratio(struct bdi_writeback *wb,
+			     unsigned long *minp, unsigned long *maxp)
+{
+	unsigned long this_bw = wb->avg_write_bandwidth;
+	unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth);
+	unsigned long long min = wb->bdi->min_ratio;
+	unsigned long long max = wb->bdi->max_ratio;
+
+	/*
+	 * @wb may already be clean by the time control reaches here and
+	 * the total may not include its bw.
+	 */
+	if (this_bw < tot_bw) {
+		if (min) {
+			min *= this_bw;
+			do_div(min, tot_bw);
+		}
+		if (max < 100) {
+			max *= this_bw;
+			do_div(max, tot_bw);
+		}
+	}
+
+	*minp = min;
+	*maxp = max;
+}
+
+#else	/* CONFIG_CGROUP_WRITEBACK */
+
+#define GDTC_INIT(__wb)		DTC_INIT_COMMON(__wb)
+#define GDTC_INIT_NO_WB
+#define MDTC_INIT(__wb, __gdtc)
+
+static bool mdtc_valid(struct dirty_throttle_control *dtc)
+{
+	return false;
+}
+
+static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc)
+{
+	return &global_wb_domain;
+}
+
+static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc)
+{
+	return NULL;
+}
+
+static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb)
+{
+	return NULL;
+}
+
+static void wb_min_max_ratio(struct bdi_writeback *wb,
+			     unsigned long *minp, unsigned long *maxp)
+{
+	*minp = wb->bdi->min_ratio;
+	*maxp = wb->bdi->max_ratio;
+}
+
+#endif	/* CONFIG_CGROUP_WRITEBACK */
+
 /*
  * In a memory zone, there is a certain amount of pages we consider
  * available for the page cache, which is essentially the number of
@@ -250,42 +341,88 @@ static unsigned long global_dirtyable_memory(void)
 	return x + 1;	/* Ensure that we never return 0 */
 }
 
-/*
- * global_dirty_limits - background-writeback and dirty-throttling thresholds
+/**
+ * domain_dirty_limits - calculate thresh and bg_thresh for a wb_domain
+ * @dtc: dirty_throttle_control of interest
  *
- * Calculate the dirty thresholds based on sysctl parameters
- * - vm.dirty_background_ratio  or  vm.dirty_background_bytes
- * - vm.dirty_ratio             or  vm.dirty_bytes
- * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
+ * Calculate @dtc->thresh and ->bg_thresh considering
+ * vm_dirty_{bytes|ratio} and dirty_background_{bytes|ratio}.  The caller
+ * must ensure that @dtc->avail is set before calling this function.  The
+ * dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
  * real-time tasks.
  */
-void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
+static void domain_dirty_limits(struct dirty_throttle_control *dtc)
 {
-	const unsigned long available_memory = global_dirtyable_memory();
-	unsigned long background;
-	unsigned long dirty;
+	const unsigned long available_memory = dtc->avail;
+	struct dirty_throttle_control *gdtc = mdtc_gdtc(dtc);
+	unsigned long bytes = vm_dirty_bytes;
+	unsigned long bg_bytes = dirty_background_bytes;
+	unsigned long ratio = vm_dirty_ratio;
+	unsigned long bg_ratio = dirty_background_ratio;
+	unsigned long thresh;
+	unsigned long bg_thresh;
 	struct task_struct *tsk;
 
-	if (vm_dirty_bytes)
-		dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
+	/* gdtc is !NULL iff @dtc is for memcg domain */
+	if (gdtc) {
+		unsigned long global_avail = gdtc->avail;
+
+		/*
+		 * The byte settings can't be applied directly to memcg
+		 * domains.  Convert them to ratios by scaling against
+		 * globally available memory.
+		 */
+		if (bytes)
+			ratio = min(DIV_ROUND_UP(bytes, PAGE_SIZE) * 100 /
+				    global_avail, 100UL);
+		if (bg_bytes)
+			bg_ratio = min(DIV_ROUND_UP(bg_bytes, PAGE_SIZE) * 100 /
+				       global_avail, 100UL);
+		bytes = bg_bytes = 0;
+	}
+
+	if (bytes)
+		thresh = DIV_ROUND_UP(bytes, PAGE_SIZE);
 	else
-		dirty = (vm_dirty_ratio * available_memory) / 100;
+		thresh = (ratio * available_memory) / 100;
 
-	if (dirty_background_bytes)
-		background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
+	if (bg_bytes)
+		bg_thresh = DIV_ROUND_UP(bg_bytes, PAGE_SIZE);
 	else
-		background = (dirty_background_ratio * available_memory) / 100;
+		bg_thresh = (bg_ratio * available_memory) / 100;
 
-	if (background >= dirty)
-		background = dirty / 2;
+	if (bg_thresh >= thresh)
+		bg_thresh = thresh / 2;
 	tsk = current;
 	if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
-		background += background / 4;
-		dirty += dirty / 4;
+		bg_thresh += bg_thresh / 4;
+		thresh += thresh / 4;
 	}
-	*pbackground = background;
-	*pdirty = dirty;
-	trace_global_dirty_state(background, dirty);
+	dtc->thresh = thresh;
+	dtc->bg_thresh = bg_thresh;
+
+	/* we should eventually report the domain in the TP */
+	if (!gdtc)
+		trace_global_dirty_state(bg_thresh, thresh);
+}
+
+/**
+ * global_dirty_limits - background-writeback and dirty-throttling thresholds
+ * @pbackground: out parameter for bg_thresh
+ * @pdirty: out parameter for thresh
+ *
+ * Calculate bg_thresh and thresh for global_wb_domain.  See
+ * domain_dirty_limits() for details.
+ */
+void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
+{
+	struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB };
+
+	gdtc.avail = global_dirtyable_memory();
+	domain_dirty_limits(&gdtc);
+
+	*pbackground = gdtc.bg_thresh;
+	*pdirty = gdtc.thresh;
 }
 
 /**
@@ -392,47 +529,52 @@ static unsigned long wp_next_time(unsigned long cur_time)
 	return cur_time;
 }
 
-/*
- * Increment the BDI's writeout completion count and the global writeout
- * completion count. Called from test_clear_page_writeback().
- */
-static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
+static void wb_domain_writeout_inc(struct wb_domain *dom,
+				   struct fprop_local_percpu *completions,
+				   unsigned int max_prop_frac)
 {
-	__inc_bdi_stat(bdi, BDI_WRITTEN);
-	__fprop_inc_percpu_max(&writeout_completions, &bdi->completions,
-			       bdi->max_prop_frac);
+	__fprop_inc_percpu_max(&dom->completions, completions,
+			       max_prop_frac);
 	/* First event after period switching was turned off? */
-	if (!unlikely(writeout_period_time)) {
+	if (!unlikely(dom->period_time)) {
 		/*
 		 * We can race with other __bdi_writeout_inc calls here but
 		 * it does not cause any harm since the resulting time when
 		 * timer will fire and what is in writeout_period_time will be
 		 * roughly the same.
 		 */
-		writeout_period_time = wp_next_time(jiffies);
-		mod_timer(&writeout_period_timer, writeout_period_time);
+		dom->period_time = wp_next_time(jiffies);
+		mod_timer(&dom->period_timer, dom->period_time);
 	}
 }
 
-void bdi_writeout_inc(struct backing_dev_info *bdi)
+/*
+ * Increment @wb's writeout completion count and the global writeout
+ * completion count. Called from test_clear_page_writeback().
+ */
+static inline void __wb_writeout_inc(struct bdi_writeback *wb)
 {
-	unsigned long flags;
+	struct wb_domain *cgdom;
 
-	local_irq_save(flags);
-	__bdi_writeout_inc(bdi);
-	local_irq_restore(flags);
+	__inc_wb_stat(wb, WB_WRITTEN);
+	wb_domain_writeout_inc(&global_wb_domain, &wb->completions,
+			       wb->bdi->max_prop_frac);
+
+	cgdom = mem_cgroup_wb_domain(wb);
+	if (cgdom)
+		wb_domain_writeout_inc(cgdom, wb_memcg_completions(wb),
+				       wb->bdi->max_prop_frac);
 }
-EXPORT_SYMBOL_GPL(bdi_writeout_inc);
 
-/*
- * Obtain an accurate fraction of the BDI's portion.
- */
-static void bdi_writeout_fraction(struct backing_dev_info *bdi,
-		long *numerator, long *denominator)
+void wb_writeout_inc(struct bdi_writeback *wb)
 {
-	fprop_fraction_percpu(&writeout_completions, &bdi->completions,
-				numerator, denominator);
+	unsigned long flags;
+
+	local_irq_save(flags);
+	__wb_writeout_inc(wb);
+	local_irq_restore(flags);
 }
+EXPORT_SYMBOL_GPL(wb_writeout_inc);
 
 /*
  * On idle system, we can be called long after we scheduled because we use
@@ -440,22 +582,46 @@ static void bdi_writeout_fraction(struct backing_dev_info *bdi,
  */
 static void writeout_period(unsigned long t)
 {
-	int miss_periods = (jiffies - writeout_period_time) /
+	struct wb_domain *dom = (void *)t;
+	int miss_periods = (jiffies - dom->period_time) /
 						 VM_COMPLETIONS_PERIOD_LEN;
 
-	if (fprop_new_period(&writeout_completions, miss_periods + 1)) {
-		writeout_period_time = wp_next_time(writeout_period_time +
+	if (fprop_new_period(&dom->completions, miss_periods + 1)) {
+		dom->period_time = wp_next_time(dom->period_time +
 				miss_periods * VM_COMPLETIONS_PERIOD_LEN);
-		mod_timer(&writeout_period_timer, writeout_period_time);
+		mod_timer(&dom->period_timer, dom->period_time);
 	} else {
 		/*
 		 * Aging has zeroed all fractions. Stop wasting CPU on period
 		 * updates.
 		 */
-		writeout_period_time = 0;
+		dom->period_time = 0;
 	}
 }
 
+int wb_domain_init(struct wb_domain *dom, gfp_t gfp)
+{
+	memset(dom, 0, sizeof(*dom));
+
+	spin_lock_init(&dom->lock);
+
+	init_timer_deferrable(&dom->period_timer);
+	dom->period_timer.function = writeout_period;
+	dom->period_timer.data = (unsigned long)dom;
+
+	dom->dirty_limit_tstamp = jiffies;
+
+	return fprop_global_init(&dom->completions, gfp);
+}
+
+#ifdef CONFIG_CGROUP_WRITEBACK
+void wb_domain_exit(struct wb_domain *dom)
+{
+	del_timer_sync(&dom->period_timer);
+	fprop_global_destroy(&dom->completions);
+}
+#endif
+
 /*
  * bdi_min_ratio keeps the sum of the minimum dirty shares of all
  * registered backing devices, which, for obvious reasons, can not
@@ -510,17 +676,26 @@ static unsigned long dirty_freerun_ceiling(unsigned long thresh,
 	return (thresh + bg_thresh) / 2;
 }
 
-static unsigned long hard_dirty_limit(unsigned long thresh)
+static unsigned long hard_dirty_limit(struct wb_domain *dom,
+				      unsigned long thresh)
 {
-	return max(thresh, global_dirty_limit);
+	return max(thresh, dom->dirty_limit);
+}
+
+/* memory available to a memcg domain is capped by system-wide clean memory */
+static void mdtc_cap_avail(struct dirty_throttle_control *mdtc)
+{
+	struct dirty_throttle_control *gdtc = mdtc_gdtc(mdtc);
+	unsigned long clean = gdtc->avail - min(gdtc->avail, gdtc->dirty);
+
+	mdtc->avail = min(mdtc->avail, clean);
 }
 
 /**
- * bdi_dirty_limit - @bdi's share of dirty throttling threshold
- * @bdi: the backing_dev_info to query
- * @dirty: global dirty limit in pages
+ * __wb_calc_thresh - @wb's share of dirty throttling threshold
+ * @dtc: dirty_throttle_context of interest
  *
- * Returns @bdi's dirty limit in pages. The term "dirty" in the context of
+ * Returns @wb's dirty limit in pages. The term "dirty" in the context of
  * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages.
  *
  * Note that balance_dirty_pages() will only seriously take it as a hard limit
@@ -528,34 +703,47 @@ static unsigned long hard_dirty_limit(unsigned long thresh)
  * control. For example, when the device is completely stalled due to some error
  * conditions, or when there are 1000 dd tasks writing to a slow 10MB/s USB key.
  * In the other normal situations, it acts more gently by throttling the tasks
- * more (rather than completely block them) when the bdi dirty pages go high.
+ * more (rather than completely block them) when the wb dirty pages go high.
  *
  * It allocates high/low dirty limits to fast/slow devices, in order to prevent
  * - starving fast devices
  * - piling up dirty pages (that will take long time to sync) on slow devices
  *
- * The bdi's share of dirty limit will be adapting to its throughput and
+ * The wb's share of dirty limit will be adapting to its throughput and
  * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set.
  */
-unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
+static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc)
 {
-	u64 bdi_dirty;
+	struct wb_domain *dom = dtc_dom(dtc);
+	unsigned long thresh = dtc->thresh;
+	u64 wb_thresh;
 	long numerator, denominator;
+	unsigned long wb_min_ratio, wb_max_ratio;
 
 	/*
-	 * Calculate this BDI's share of the dirty ratio.
+	 * Calculate this BDI's share of the thresh ratio.
 	 */
-	bdi_writeout_fraction(bdi, &numerator, &denominator);
+	fprop_fraction_percpu(&dom->completions, dtc->wb_completions,
+			      &numerator, &denominator);
+
+	wb_thresh = (thresh * (100 - bdi_min_ratio)) / 100;
+	wb_thresh *= numerator;
+	do_div(wb_thresh, denominator);
 
-	bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100;
-	bdi_dirty *= numerator;
-	do_div(bdi_dirty, denominator);
+	wb_min_max_ratio(dtc->wb, &wb_min_ratio, &wb_max_ratio);
 
-	bdi_dirty += (dirty * bdi->min_ratio) / 100;
-	if (bdi_dirty > (dirty * bdi->max_ratio) / 100)
-		bdi_dirty = dirty * bdi->max_ratio / 100;
+	wb_thresh += (thresh * wb_min_ratio) / 100;
+	if (wb_thresh > (thresh * wb_max_ratio) / 100)
+		wb_thresh = thresh * wb_max_ratio / 100;
 
-	return bdi_dirty;
+	return wb_thresh;
+}
+
+unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh)
+{
+	struct dirty_throttle_control gdtc = { GDTC_INIT(wb),
+					       .thresh = thresh };
+	return __wb_calc_thresh(&gdtc);
 }
 
 /*
@@ -594,7 +782,7 @@ static long long pos_ratio_polynom(unsigned long setpoint,
  *
  * (o) global/bdi setpoints
  *
- * We want the dirty pages be balanced around the global/bdi setpoints.
+ * We want the dirty pages be balanced around the global/wb setpoints.
  * When the number of dirty pages is higher/lower than the setpoint, the
  * dirty position control ratio (and hence task dirty ratelimit) will be
  * decreased/increased to bring the dirty pages back to the setpoint.
@@ -604,8 +792,8 @@ static long long pos_ratio_polynom(unsigned long setpoint,
  *     if (dirty < setpoint) scale up   pos_ratio
  *     if (dirty > setpoint) scale down pos_ratio
  *
- *     if (bdi_dirty < bdi_setpoint) scale up   pos_ratio
- *     if (bdi_dirty > bdi_setpoint) scale down pos_ratio
+ *     if (wb_dirty < wb_setpoint) scale up   pos_ratio
+ *     if (wb_dirty > wb_setpoint) scale down pos_ratio
  *
  *     task_ratelimit = dirty_ratelimit * pos_ratio >> RATELIMIT_CALC_SHIFT
  *
@@ -630,7 +818,7 @@ static long long pos_ratio_polynom(unsigned long setpoint,
  *   0 +------------.------------------.----------------------*------------->
  *           freerun^          setpoint^                 limit^   dirty pages
  *
- * (o) bdi control line
+ * (o) wb control line
  *
  *     ^ pos_ratio
  *     |
@@ -656,33 +844,32 @@ static long long pos_ratio_polynom(unsigned long setpoint,
  *     |                      .                           .
  *     |                      .                             .
  *   0 +----------------------.-------------------------------.------------->
- *                bdi_setpoint^                    x_intercept^
+ *                wb_setpoint^                    x_intercept^
  *
- * The bdi control line won't drop below pos_ratio=1/4, so that bdi_dirty can
+ * The wb control line won't drop below pos_ratio=1/4, so that wb_dirty can
  * be smoothly throttled down to normal if it starts high in situations like
  * - start writing to a slow SD card and a fast disk at the same time. The SD
- *   card's bdi_dirty may rush to many times higher than bdi_setpoint.
- * - the bdi dirty thresh drops quickly due to change of JBOD workload
+ *   card's wb_dirty may rush to many times higher than wb_setpoint.
+ * - the wb dirty thresh drops quickly due to change of JBOD workload
  */
-static unsigned long bdi_position_ratio(struct backing_dev_info *bdi,
-					unsigned long thresh,
-					unsigned long bg_thresh,
-					unsigned long dirty,
-					unsigned long bdi_thresh,
-					unsigned long bdi_dirty)
-{
-	unsigned long write_bw = bdi->avg_write_bandwidth;
-	unsigned long freerun = dirty_freerun_ceiling(thresh, bg_thresh);
-	unsigned long limit = hard_dirty_limit(thresh);
+static void wb_position_ratio(struct dirty_throttle_control *dtc)
+{
+	struct bdi_writeback *wb = dtc->wb;
+	unsigned long write_bw = wb->avg_write_bandwidth;
+	unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh);
+	unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh);
+	unsigned long wb_thresh = dtc->wb_thresh;
 	unsigned long x_intercept;
 	unsigned long setpoint;		/* dirty pages' target balance point */
-	unsigned long bdi_setpoint;
+	unsigned long wb_setpoint;
 	unsigned long span;
 	long long pos_ratio;		/* for scaling up/down the rate limit */
 	long x;
 
-	if (unlikely(dirty >= limit))
-		return 0;
+	dtc->pos_ratio = 0;
+
+	if (unlikely(dtc->dirty >= limit))
+		return;
 
 	/*
 	 * global setpoint
@@ -690,165 +877,167 @@ static unsigned long bdi_position_ratio(struct backing_dev_info *bdi,
 	 * See comment for pos_ratio_polynom().
 	 */
 	setpoint = (freerun + limit) / 2;
-	pos_ratio = pos_ratio_polynom(setpoint, dirty, limit);
+	pos_ratio = pos_ratio_polynom(setpoint, dtc->dirty, limit);
 
 	/*
 	 * The strictlimit feature is a tool preventing mistrusted filesystems
 	 * from growing a large number of dirty pages before throttling. For
-	 * such filesystems balance_dirty_pages always checks bdi counters
-	 * against bdi limits. Even if global "nr_dirty" is under "freerun".
+	 * such filesystems balance_dirty_pages always checks wb counters
+	 * against wb limits. Even if global "nr_dirty" is under "freerun".
 	 * This is especially important for fuse which sets bdi->max_ratio to
 	 * 1% by default. Without strictlimit feature, fuse writeback may
 	 * consume arbitrary amount of RAM because it is accounted in
 	 * NR_WRITEBACK_TEMP which is not involved in calculating "nr_dirty".
 	 *
-	 * Here, in bdi_position_ratio(), we calculate pos_ratio based on
-	 * two values: bdi_dirty and bdi_thresh. Let's consider an example:
+	 * Here, in wb_position_ratio(), we calculate pos_ratio based on
+	 * two values: wb_dirty and wb_thresh. Let's consider an example:
 	 * total amount of RAM is 16GB, bdi->max_ratio is equal to 1%, global
 	 * limits are set by default to 10% and 20% (background and throttle).
-	 * Then bdi_thresh is 1% of 20% of 16GB. This amounts to ~8K pages.
-	 * bdi_dirty_limit(bdi, bg_thresh) is about ~4K pages. bdi_setpoint is
-	 * about ~6K pages (as the average of background and throttle bdi
+	 * Then wb_thresh is 1% of 20% of 16GB. This amounts to ~8K pages.
+	 * wb_calc_thresh(wb, bg_thresh) is about ~4K pages. wb_setpoint is
+	 * about ~6K pages (as the average of background and throttle wb
 	 * limits). The 3rd order polynomial will provide positive feedback if
-	 * bdi_dirty is under bdi_setpoint and vice versa.
+	 * wb_dirty is under wb_setpoint and vice versa.
 	 *
 	 * Note, that we cannot use global counters in these calculations
-	 * because we want to throttle process writing to a strictlimit BDI
+	 * because we want to throttle process writing to a strictlimit wb
 	 * much earlier than global "freerun" is reached (~23MB vs. ~2.3GB
 	 * in the example above).
 	 */
-	if (unlikely(bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
-		long long bdi_pos_ratio;
-		unsigned long bdi_bg_thresh;
+	if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
+		long long wb_pos_ratio;
 
-		if (bdi_dirty < 8)
-			return min_t(long long, pos_ratio * 2,
-				     2 << RATELIMIT_CALC_SHIFT);
+		if (dtc->wb_dirty < 8) {
+			dtc->pos_ratio = min_t(long long, pos_ratio * 2,
+					   2 << RATELIMIT_CALC_SHIFT);
+			return;
+		}
 
-		if (bdi_dirty >= bdi_thresh)
-			return 0;
+		if (dtc->wb_dirty >= wb_thresh)
+			return;
 
-		bdi_bg_thresh = div_u64((u64)bdi_thresh * bg_thresh, thresh);
-		bdi_setpoint = dirty_freerun_ceiling(bdi_thresh,
-						     bdi_bg_thresh);
+		wb_setpoint = dirty_freerun_ceiling(wb_thresh,
+						    dtc->wb_bg_thresh);
 
-		if (bdi_setpoint == 0 || bdi_setpoint == bdi_thresh)
-			return 0;
+		if (wb_setpoint == 0 || wb_setpoint == wb_thresh)
+			return;
 
-		bdi_pos_ratio = pos_ratio_polynom(bdi_setpoint, bdi_dirty,
-						  bdi_thresh);
+		wb_pos_ratio = pos_ratio_polynom(wb_setpoint, dtc->wb_dirty,
+						 wb_thresh);
 
 		/*
-		 * Typically, for strictlimit case, bdi_setpoint << setpoint
-		 * and pos_ratio >> bdi_pos_ratio. In the other words global
+		 * Typically, for strictlimit case, wb_setpoint << setpoint
+		 * and pos_ratio >> wb_pos_ratio. In the other words global
 		 * state ("dirty") is not limiting factor and we have to
-		 * make decision based on bdi counters. But there is an
+		 * make decision based on wb counters. But there is an
 		 * important case when global pos_ratio should get precedence:
 		 * global limits are exceeded (e.g. due to activities on other
-		 * BDIs) while given strictlimit BDI is below limit.
+		 * wb's) while given strictlimit wb is below limit.
 		 *
-		 * "pos_ratio * bdi_pos_ratio" would work for the case above,
+		 * "pos_ratio * wb_pos_ratio" would work for the case above,
 		 * but it would look too non-natural for the case of all
-		 * activity in the system coming from a single strictlimit BDI
+		 * activity in the system coming from a single strictlimit wb
 		 * with bdi->max_ratio == 100%.
 		 *
 		 * Note that min() below somewhat changes the dynamics of the
 		 * control system. Normally, pos_ratio value can be well over 3
-		 * (when globally we are at freerun and bdi is well below bdi
+		 * (when globally we are at freerun and wb is well below wb
 		 * setpoint). Now the maximum pos_ratio in the same situation
 		 * is 2. We might want to tweak this if we observe the control
 		 * system is too slow to adapt.
 		 */
-		return min(pos_ratio, bdi_pos_ratio);
+		dtc->pos_ratio = min(pos_ratio, wb_pos_ratio);
+		return;
 	}
 
 	/*
 	 * We have computed basic pos_ratio above based on global situation. If
-	 * the bdi is over/under its share of dirty pages, we want to scale
+	 * the wb is over/under its share of dirty pages, we want to scale
 	 * pos_ratio further down/up. That is done by the following mechanism.
 	 */
 
 	/*
-	 * bdi setpoint
+	 * wb setpoint
 	 *
-	 *        f(bdi_dirty) := 1.0 + k * (bdi_dirty - bdi_setpoint)
+	 *        f(wb_dirty) := 1.0 + k * (wb_dirty - wb_setpoint)
 	 *
-	 *                        x_intercept - bdi_dirty
+	 *                        x_intercept - wb_dirty
 	 *                     := --------------------------
-	 *                        x_intercept - bdi_setpoint
+	 *                        x_intercept - wb_setpoint
 	 *
-	 * The main bdi control line is a linear function that subjects to
+	 * The main wb control line is a linear function that subjects to
 	 *
-	 * (1) f(bdi_setpoint) = 1.0
-	 * (2) k = - 1 / (8 * write_bw)  (in single bdi case)
-	 *     or equally: x_intercept = bdi_setpoint + 8 * write_bw
+	 * (1) f(wb_setpoint) = 1.0
+	 * (2) k = - 1 / (8 * write_bw)  (in single wb case)
+	 *     or equally: x_intercept = wb_setpoint + 8 * write_bw
 	 *
-	 * For single bdi case, the dirty pages are observed to fluctuate
+	 * For single wb case, the dirty pages are observed to fluctuate
 	 * regularly within range
-	 *        [bdi_setpoint - write_bw/2, bdi_setpoint + write_bw/2]
+	 *        [wb_setpoint - write_bw/2, wb_setpoint + write_bw/2]
 	 * for various filesystems, where (2) can yield in a reasonable 12.5%
 	 * fluctuation range for pos_ratio.
 	 *
-	 * For JBOD case, bdi_thresh (not bdi_dirty!) could fluctuate up to its
+	 * For JBOD case, wb_thresh (not wb_dirty!) could fluctuate up to its
 	 * own size, so move the slope over accordingly and choose a slope that
-	 * yields 100% pos_ratio fluctuation on suddenly doubled bdi_thresh.
+	 * yields 100% pos_ratio fluctuation on suddenly doubled wb_thresh.
 	 */
-	if (unlikely(bdi_thresh > thresh))
-		bdi_thresh = thresh;
+	if (unlikely(wb_thresh > dtc->thresh))
+		wb_thresh = dtc->thresh;
 	/*
-	 * It's very possible that bdi_thresh is close to 0 not because the
+	 * It's very possible that wb_thresh is close to 0 not because the
 	 * device is slow, but that it has remained inactive for long time.
 	 * Honour such devices a reasonable good (hopefully IO efficient)
 	 * threshold, so that the occasional writes won't be blocked and active
 	 * writes can rampup the threshold quickly.
 	 */
-	bdi_thresh = max(bdi_thresh, (limit - dirty) / 8);
+	wb_thresh = max(wb_thresh, (limit - dtc->dirty) / 8);
 	/*
-	 * scale global setpoint to bdi's:
-	 *	bdi_setpoint = setpoint * bdi_thresh / thresh
+	 * scale global setpoint to wb's:
+	 *	wb_setpoint = setpoint * wb_thresh / thresh
 	 */
-	x = div_u64((u64)bdi_thresh << 16, thresh | 1);
-	bdi_setpoint = setpoint * (u64)x >> 16;
+	x = div_u64((u64)wb_thresh << 16, dtc->thresh | 1);
+	wb_setpoint = setpoint * (u64)x >> 16;
 	/*
-	 * Use span=(8*write_bw) in single bdi case as indicated by
-	 * (thresh - bdi_thresh ~= 0) and transit to bdi_thresh in JBOD case.
+	 * Use span=(8*write_bw) in single wb case as indicated by
+	 * (thresh - wb_thresh ~= 0) and transit to wb_thresh in JBOD case.
 	 *
-	 *        bdi_thresh                    thresh - bdi_thresh
-	 * span = ---------- * (8 * write_bw) + ------------------- * bdi_thresh
-	 *          thresh                            thresh
+	 *        wb_thresh                    thresh - wb_thresh
+	 * span = --------- * (8 * write_bw) + ------------------ * wb_thresh
+	 *         thresh                           thresh
 	 */
-	span = (thresh - bdi_thresh + 8 * write_bw) * (u64)x >> 16;
-	x_intercept = bdi_setpoint + span;
+	span = (dtc->thresh - wb_thresh + 8 * write_bw) * (u64)x >> 16;
+	x_intercept = wb_setpoint + span;
 
-	if (bdi_dirty < x_intercept - span / 4) {
-		pos_ratio = div64_u64(pos_ratio * (x_intercept - bdi_dirty),
-				      (x_intercept - bdi_setpoint) | 1);
+	if (dtc->wb_dirty < x_intercept - span / 4) {
+		pos_ratio = div64_u64(pos_ratio * (x_intercept - dtc->wb_dirty),
+				      (x_intercept - wb_setpoint) | 1);
 	} else
 		pos_ratio /= 4;
 
 	/*
-	 * bdi reserve area, safeguard against dirty pool underrun and disk idle
+	 * wb reserve area, safeguard against dirty pool underrun and disk idle
 	 * It may push the desired control point of global dirty pages higher
 	 * than setpoint.
 	 */
-	x_intercept = bdi_thresh / 2;
-	if (bdi_dirty < x_intercept) {
-		if (bdi_dirty > x_intercept / 8)
-			pos_ratio = div_u64(pos_ratio * x_intercept, bdi_dirty);
+	x_intercept = wb_thresh / 2;
+	if (dtc->wb_dirty < x_intercept) {
+		if (dtc->wb_dirty > x_intercept / 8)
+			pos_ratio = div_u64(pos_ratio * x_intercept,
+					    dtc->wb_dirty);
 		else
 			pos_ratio *= 8;
 	}
 
-	return pos_ratio;
+	dtc->pos_ratio = pos_ratio;
 }
 
-static void bdi_update_write_bandwidth(struct backing_dev_info *bdi,
-				       unsigned long elapsed,
-				       unsigned long written)
+static void wb_update_write_bandwidth(struct bdi_writeback *wb,
+				      unsigned long elapsed,
+				      unsigned long written)
 {
 	const unsigned long period = roundup_pow_of_two(3 * HZ);
-	unsigned long avg = bdi->avg_write_bandwidth;
-	unsigned long old = bdi->write_bandwidth;
+	unsigned long avg = wb->avg_write_bandwidth;
+	unsigned long old = wb->write_bandwidth;
 	u64 bw;
 
 	/*
@@ -861,14 +1050,14 @@ static void bdi_update_write_bandwidth(struct backing_dev_info *bdi,
 	 * @written may have decreased due to account_page_redirty().
 	 * Avoid underflowing @bw calculation.
 	 */
-	bw = written - min(written, bdi->written_stamp);
+	bw = written - min(written, wb->written_stamp);
 	bw *= HZ;
 	if (unlikely(elapsed > period)) {
 		do_div(bw, elapsed);
 		avg = bw;
 		goto out;
 	}
-	bw += (u64)bdi->write_bandwidth * (period - elapsed);
+	bw += (u64)wb->write_bandwidth * (period - elapsed);
 	bw >>= ilog2(period);
 
 	/*
@@ -881,21 +1070,22 @@ static void bdi_update_write_bandwidth(struct backing_dev_info *bdi,
 		avg += (old - avg) >> 3;
 
 out:
-	bdi->write_bandwidth = bw;
-	bdi->avg_write_bandwidth = avg;
+	/* keep avg > 0 to guarantee that tot > 0 if there are dirty wbs */
+	avg = max(avg, 1LU);
+	if (wb_has_dirty_io(wb)) {
+		long delta = avg - wb->avg_write_bandwidth;
+		WARN_ON_ONCE(atomic_long_add_return(delta,
+					&wb->bdi->tot_write_bandwidth) <= 0);
+	}
+	wb->write_bandwidth = bw;
+	wb->avg_write_bandwidth = avg;
 }
 
-/*
- * The global dirtyable memory and dirty threshold could be suddenly knocked
- * down by a large amount (eg. on the startup of KVM in a swapless system).
- * This may throw the system into deep dirty exceeded state and throttle
- * heavy/light dirtiers alike. To retain good responsiveness, maintain
- * global_dirty_limit for tracking slowly down to the knocked down dirty
- * threshold.
- */
-static void update_dirty_limit(unsigned long thresh, unsigned long dirty)
+static void update_dirty_limit(struct dirty_throttle_control *dtc)
 {
-	unsigned long limit = global_dirty_limit;
+	struct wb_domain *dom = dtc_dom(dtc);
+	unsigned long thresh = dtc->thresh;
+	unsigned long limit = dom->dirty_limit;
 
 	/*
 	 * Follow up in one step.
@@ -908,63 +1098,57 @@ static void update_dirty_limit(unsigned long thresh, unsigned long dirty)
 	/*
 	 * Follow down slowly. Use the higher one as the target, because thresh
 	 * may drop below dirty. This is exactly the reason to introduce
-	 * global_dirty_limit which is guaranteed to lie above the dirty pages.
+	 * dom->dirty_limit which is guaranteed to lie above the dirty pages.
 	 */
-	thresh = max(thresh, dirty);
+	thresh = max(thresh, dtc->dirty);
 	if (limit > thresh) {
 		limit -= (limit - thresh) >> 5;
 		goto update;
 	}
 	return;
 update:
-	global_dirty_limit = limit;
+	dom->dirty_limit = limit;
 }
 
-static void global_update_bandwidth(unsigned long thresh,
-				    unsigned long dirty,
+static void domain_update_bandwidth(struct dirty_throttle_control *dtc,
 				    unsigned long now)
 {
-	static DEFINE_SPINLOCK(dirty_lock);
-	static unsigned long update_time = INITIAL_JIFFIES;
+	struct wb_domain *dom = dtc_dom(dtc);
 
 	/*
 	 * check locklessly first to optimize away locking for the most time
 	 */
-	if (time_before(now, update_time + BANDWIDTH_INTERVAL))
+	if (time_before(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL))
 		return;
 
-	spin_lock(&dirty_lock);
-	if (time_after_eq(now, update_time + BANDWIDTH_INTERVAL)) {
-		update_dirty_limit(thresh, dirty);
-		update_time = now;
+	spin_lock(&dom->lock);
+	if (time_after_eq(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL)) {
+		update_dirty_limit(dtc);
+		dom->dirty_limit_tstamp = now;
 	}
-	spin_unlock(&dirty_lock);
+	spin_unlock(&dom->lock);
 }
 
 /*
- * Maintain bdi->dirty_ratelimit, the base dirty throttle rate.
+ * Maintain wb->dirty_ratelimit, the base dirty throttle rate.
  *
- * Normal bdi tasks will be curbed at or below it in long term.
+ * Normal wb tasks will be curbed at or below it in long term.
  * Obviously it should be around (write_bw / N) when there are N dd tasks.
  */
-static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
-				       unsigned long thresh,
-				       unsigned long bg_thresh,
-				       unsigned long dirty,
-				       unsigned long bdi_thresh,
-				       unsigned long bdi_dirty,
-				       unsigned long dirtied,
-				       unsigned long elapsed)
-{
-	unsigned long freerun = dirty_freerun_ceiling(thresh, bg_thresh);
-	unsigned long limit = hard_dirty_limit(thresh);
+static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc,
+				      unsigned long dirtied,
+				      unsigned long elapsed)
+{
+	struct bdi_writeback *wb = dtc->wb;
+	unsigned long dirty = dtc->dirty;
+	unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh);
+	unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh);
 	unsigned long setpoint = (freerun + limit) / 2;
-	unsigned long write_bw = bdi->avg_write_bandwidth;
-	unsigned long dirty_ratelimit = bdi->dirty_ratelimit;
+	unsigned long write_bw = wb->avg_write_bandwidth;
+	unsigned long dirty_ratelimit = wb->dirty_ratelimit;
 	unsigned long dirty_rate;
 	unsigned long task_ratelimit;
 	unsigned long balanced_dirty_ratelimit;
-	unsigned long pos_ratio;
 	unsigned long step;
 	unsigned long x;
 
@@ -972,20 +1156,18 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
 	 * The dirty rate will match the writeout rate in long term, except
 	 * when dirty pages are truncated by userspace or re-dirtied by FS.
 	 */
-	dirty_rate = (dirtied - bdi->dirtied_stamp) * HZ / elapsed;
+	dirty_rate = (dirtied - wb->dirtied_stamp) * HZ / elapsed;
 
-	pos_ratio = bdi_position_ratio(bdi, thresh, bg_thresh, dirty,
-				       bdi_thresh, bdi_dirty);
 	/*
 	 * task_ratelimit reflects each dd's dirty rate for the past 200ms.
 	 */
 	task_ratelimit = (u64)dirty_ratelimit *
-					pos_ratio >> RATELIMIT_CALC_SHIFT;
+					dtc->pos_ratio >> RATELIMIT_CALC_SHIFT;
 	task_ratelimit++; /* it helps rampup dirty_ratelimit from tiny values */
 
 	/*
 	 * A linear estimation of the "balanced" throttle rate. The theory is,
-	 * if there are N dd tasks, each throttled at task_ratelimit, the bdi's
+	 * if there are N dd tasks, each throttled at task_ratelimit, the wb's
 	 * dirty_rate will be measured to be (N * task_ratelimit). So the below
 	 * formula will yield the balanced rate limit (write_bw / N).
 	 *
@@ -1024,7 +1206,7 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
 	/*
 	 * We could safely do this and return immediately:
 	 *
-	 *	bdi->dirty_ratelimit = balanced_dirty_ratelimit;
+	 *	wb->dirty_ratelimit = balanced_dirty_ratelimit;
 	 *
 	 * However to get a more stable dirty_ratelimit, the below elaborated
 	 * code makes use of task_ratelimit to filter out singular points and
@@ -1058,32 +1240,31 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
 	step = 0;
 
 	/*
-	 * For strictlimit case, calculations above were based on bdi counters
-	 * and limits (starting from pos_ratio = bdi_position_ratio() and up to
+	 * For strictlimit case, calculations above were based on wb counters
+	 * and limits (starting from pos_ratio = wb_position_ratio() and up to
 	 * balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate).
-	 * Hence, to calculate "step" properly, we have to use bdi_dirty as
-	 * "dirty" and bdi_setpoint as "setpoint".
+	 * Hence, to calculate "step" properly, we have to use wb_dirty as
+	 * "dirty" and wb_setpoint as "setpoint".
 	 *
-	 * We rampup dirty_ratelimit forcibly if bdi_dirty is low because
-	 * it's possible that bdi_thresh is close to zero due to inactivity
-	 * of backing device (see the implementation of bdi_dirty_limit()).
+	 * We rampup dirty_ratelimit forcibly if wb_dirty is low because
+	 * it's possible that wb_thresh is close to zero due to inactivity
+	 * of backing device.
 	 */
-	if (unlikely(bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
-		dirty = bdi_dirty;
-		if (bdi_dirty < 8)
-			setpoint = bdi_dirty + 1;
+	if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
+		dirty = dtc->wb_dirty;
+		if (dtc->wb_dirty < 8)
+			setpoint = dtc->wb_dirty + 1;
 		else
-			setpoint = (bdi_thresh +
-				    bdi_dirty_limit(bdi, bg_thresh)) / 2;
+			setpoint = (dtc->wb_thresh + dtc->wb_bg_thresh) / 2;
 	}
 
 	if (dirty < setpoint) {
-		x = min3(bdi->balanced_dirty_ratelimit,
+		x = min3(wb->balanced_dirty_ratelimit,
 			 balanced_dirty_ratelimit, task_ratelimit);
 		if (dirty_ratelimit < x)
 			step = x - dirty_ratelimit;
 	} else {
-		x = max3(bdi->balanced_dirty_ratelimit,
+		x = max3(wb->balanced_dirty_ratelimit,
 			 balanced_dirty_ratelimit, task_ratelimit);
 		if (dirty_ratelimit > x)
 			step = dirty_ratelimit - x;
@@ -1105,69 +1286,67 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
 	else
 		dirty_ratelimit -= step;
 
-	bdi->dirty_ratelimit = max(dirty_ratelimit, 1UL);
-	bdi->balanced_dirty_ratelimit = balanced_dirty_ratelimit;
+	wb->dirty_ratelimit = max(dirty_ratelimit, 1UL);
+	wb->balanced_dirty_ratelimit = balanced_dirty_ratelimit;
 
-	trace_bdi_dirty_ratelimit(bdi, dirty_rate, task_ratelimit);
+	trace_bdi_dirty_ratelimit(wb->bdi, dirty_rate, task_ratelimit);
 }
 
-void __bdi_update_bandwidth(struct backing_dev_info *bdi,
-			    unsigned long thresh,
-			    unsigned long bg_thresh,
-			    unsigned long dirty,
-			    unsigned long bdi_thresh,
-			    unsigned long bdi_dirty,
-			    unsigned long start_time)
+static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc,
+				  struct dirty_throttle_control *mdtc,
+				  unsigned long start_time,
+				  bool update_ratelimit)
 {
+	struct bdi_writeback *wb = gdtc->wb;
 	unsigned long now = jiffies;
-	unsigned long elapsed = now - bdi->bw_time_stamp;
+	unsigned long elapsed = now - wb->bw_time_stamp;
 	unsigned long dirtied;
 	unsigned long written;
 
+	lockdep_assert_held(&wb->list_lock);
+
 	/*
 	 * rate-limit, only update once every 200ms.
 	 */
 	if (elapsed < BANDWIDTH_INTERVAL)
 		return;
 
-	dirtied = percpu_counter_read(&bdi->bdi_stat[BDI_DIRTIED]);
-	written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]);
+	dirtied = percpu_counter_read(&wb->stat[WB_DIRTIED]);
+	written = percpu_counter_read(&wb->stat[WB_WRITTEN]);
 
 	/*
 	 * Skip quiet periods when disk bandwidth is under-utilized.
 	 * (at least 1s idle time between two flusher runs)
 	 */
-	if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time))
+	if (elapsed > HZ && time_before(wb->bw_time_stamp, start_time))
 		goto snapshot;
 
-	if (thresh) {
-		global_update_bandwidth(thresh, dirty, now);
-		bdi_update_dirty_ratelimit(bdi, thresh, bg_thresh, dirty,
-					   bdi_thresh, bdi_dirty,
-					   dirtied, elapsed);
+	if (update_ratelimit) {
+		domain_update_bandwidth(gdtc, now);
+		wb_update_dirty_ratelimit(gdtc, dirtied, elapsed);
+
+		/*
+		 * @mdtc is always NULL if !CGROUP_WRITEBACK but the
+		 * compiler has no way to figure that out.  Help it.
+		 */
+		if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) && mdtc) {
+			domain_update_bandwidth(mdtc, now);
+			wb_update_dirty_ratelimit(mdtc, dirtied, elapsed);
+		}
 	}
-	bdi_update_write_bandwidth(bdi, elapsed, written);
+	wb_update_write_bandwidth(wb, elapsed, written);
 
 snapshot:
-	bdi->dirtied_stamp = dirtied;
-	bdi->written_stamp = written;
-	bdi->bw_time_stamp = now;
+	wb->dirtied_stamp = dirtied;
+	wb->written_stamp = written;
+	wb->bw_time_stamp = now;
 }
 
-static void bdi_update_bandwidth(struct backing_dev_info *bdi,
-				 unsigned long thresh,
-				 unsigned long bg_thresh,
-				 unsigned long dirty,
-				 unsigned long bdi_thresh,
-				 unsigned long bdi_dirty,
-				 unsigned long start_time)
+void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time)
 {
-	if (time_is_after_eq_jiffies(bdi->bw_time_stamp + BANDWIDTH_INTERVAL))
-		return;
-	spin_lock(&bdi->wb.list_lock);
-	__bdi_update_bandwidth(bdi, thresh, bg_thresh, dirty,
-			       bdi_thresh, bdi_dirty, start_time);
-	spin_unlock(&bdi->wb.list_lock);
+	struct dirty_throttle_control gdtc = { GDTC_INIT(wb) };
+
+	__wb_update_bandwidth(&gdtc, NULL, start_time, false);
 }
 
 /*
@@ -1187,10 +1366,10 @@ static unsigned long dirty_poll_interval(unsigned long dirty,
 	return 1;
 }
 
-static unsigned long bdi_max_pause(struct backing_dev_info *bdi,
-				   unsigned long bdi_dirty)
+static unsigned long wb_max_pause(struct bdi_writeback *wb,
+				  unsigned long wb_dirty)
 {
-	unsigned long bw = bdi->avg_write_bandwidth;
+	unsigned long bw = wb->avg_write_bandwidth;
 	unsigned long t;
 
 	/*
@@ -1200,20 +1379,20 @@ static unsigned long bdi_max_pause(struct backing_dev_info *bdi,
 	 *
 	 * 8 serves as the safety ratio.
 	 */
-	t = bdi_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8));
+	t = wb_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8));
 	t++;
 
 	return min_t(unsigned long, t, MAX_PAUSE);
 }
 
-static long bdi_min_pause(struct backing_dev_info *bdi,
-			  long max_pause,
-			  unsigned long task_ratelimit,
-			  unsigned long dirty_ratelimit,
-			  int *nr_dirtied_pause)
+static long wb_min_pause(struct bdi_writeback *wb,
+			 long max_pause,
+			 unsigned long task_ratelimit,
+			 unsigned long dirty_ratelimit,
+			 int *nr_dirtied_pause)
 {
-	long hi = ilog2(bdi->avg_write_bandwidth);
-	long lo = ilog2(bdi->dirty_ratelimit);
+	long hi = ilog2(wb->avg_write_bandwidth);
+	long lo = ilog2(wb->dirty_ratelimit);
 	long t;		/* target pause */
 	long pause;	/* estimated next pause */
 	int pages;	/* target nr_dirtied_pause */
@@ -1281,34 +1460,27 @@ static long bdi_min_pause(struct backing_dev_info *bdi,
 	return pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t;
 }
 
-static inline void bdi_dirty_limits(struct backing_dev_info *bdi,
-				    unsigned long dirty_thresh,
-				    unsigned long background_thresh,
-				    unsigned long *bdi_dirty,
-				    unsigned long *bdi_thresh,
-				    unsigned long *bdi_bg_thresh)
+static inline void wb_dirty_limits(struct dirty_throttle_control *dtc)
 {
-	unsigned long bdi_reclaimable;
+	struct bdi_writeback *wb = dtc->wb;
+	unsigned long wb_reclaimable;
 
 	/*
-	 * bdi_thresh is not treated as some limiting factor as
+	 * wb_thresh is not treated as some limiting factor as
 	 * dirty_thresh, due to reasons
-	 * - in JBOD setup, bdi_thresh can fluctuate a lot
+	 * - in JBOD setup, wb_thresh can fluctuate a lot
 	 * - in a system with HDD and USB key, the USB key may somehow
-	 *   go into state (bdi_dirty >> bdi_thresh) either because
-	 *   bdi_dirty starts high, or because bdi_thresh drops low.
+	 *   go into state (wb_dirty >> wb_thresh) either because
+	 *   wb_dirty starts high, or because wb_thresh drops low.
 	 *   In this case we don't want to hard throttle the USB key
-	 *   dirtiers for 100 seconds until bdi_dirty drops under
-	 *   bdi_thresh. Instead the auxiliary bdi control line in
-	 *   bdi_position_ratio() will let the dirtier task progress
-	 *   at some rate <= (write_bw / 2) for bringing down bdi_dirty.
+	 *   dirtiers for 100 seconds until wb_dirty drops under
+	 *   wb_thresh. Instead the auxiliary wb control line in
+	 *   wb_position_ratio() will let the dirtier task progress
+	 *   at some rate <= (write_bw / 2) for bringing down wb_dirty.
 	 */
-	*bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
-
-	if (bdi_bg_thresh)
-		*bdi_bg_thresh = dirty_thresh ? div_u64((u64)*bdi_thresh *
-							background_thresh,
-							dirty_thresh) : 0;
+	dtc->wb_thresh = __wb_calc_thresh(dtc);
+	dtc->wb_bg_thresh = dtc->thresh ?
+		div_u64((u64)dtc->wb_thresh * dtc->bg_thresh, dtc->thresh) : 0;
 
 	/*
 	 * In order to avoid the stacked BDI deadlock we need
@@ -1320,14 +1492,12 @@ static inline void bdi_dirty_limits(struct backing_dev_info *bdi,
 	 * actually dirty; with m+n sitting in the percpu
 	 * deltas.
 	 */
-	if (*bdi_thresh < 2 * bdi_stat_error(bdi)) {
-		bdi_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
-		*bdi_dirty = bdi_reclaimable +
-			bdi_stat_sum(bdi, BDI_WRITEBACK);
+	if (dtc->wb_thresh < 2 * wb_stat_error(wb)) {
+		wb_reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE);
+		dtc->wb_dirty = wb_reclaimable + wb_stat_sum(wb, WB_WRITEBACK);
 	} else {
-		bdi_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
-		*bdi_dirty = bdi_reclaimable +
-			bdi_stat(bdi, BDI_WRITEBACK);
+		wb_reclaimable = wb_stat(wb, WB_RECLAIMABLE);
+		dtc->wb_dirty = wb_reclaimable + wb_stat(wb, WB_WRITEBACK);
 	}
 }
 
@@ -1339,12 +1509,16 @@ static inline void bdi_dirty_limits(struct backing_dev_info *bdi,
  * perform some writeout.
  */
 static void balance_dirty_pages(struct address_space *mapping,
+				struct bdi_writeback *wb,
 				unsigned long pages_dirtied)
 {
+	struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) };
+	struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) };
+	struct dirty_throttle_control * const gdtc = &gdtc_stor;
+	struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
+						     &mdtc_stor : NULL;
+	struct dirty_throttle_control *sdtc;
 	unsigned long nr_reclaimable;	/* = file_dirty + unstable_nfs */
-	unsigned long nr_dirty;  /* = file_dirty + writeback + unstable_nfs */
-	unsigned long background_thresh;
-	unsigned long dirty_thresh;
 	long period;
 	long pause;
 	long max_pause;
@@ -1353,18 +1527,14 @@ static void balance_dirty_pages(struct address_space *mapping,
 	bool dirty_exceeded = false;
 	unsigned long task_ratelimit;
 	unsigned long dirty_ratelimit;
-	unsigned long pos_ratio;
-	struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
+	struct backing_dev_info *bdi = wb->bdi;
 	bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT;
 	unsigned long start_time = jiffies;
 
 	for (;;) {
 		unsigned long now = jiffies;
-		unsigned long uninitialized_var(bdi_thresh);
-		unsigned long thresh;
-		unsigned long uninitialized_var(bdi_dirty);
-		unsigned long dirty;
-		unsigned long bg_thresh;
+		unsigned long dirty, thresh, bg_thresh;
+		unsigned long m_dirty, m_thresh, m_bg_thresh;
 
 		/*
 		 * Unstable writes are a feature of certain networked
@@ -1374,65 +1544,127 @@ static void balance_dirty_pages(struct address_space *mapping,
 		 */
 		nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
 					global_page_state(NR_UNSTABLE_NFS);
-		nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK);
+		gdtc->avail = global_dirtyable_memory();
+		gdtc->dirty = nr_reclaimable + global_page_state(NR_WRITEBACK);
 
-		global_dirty_limits(&background_thresh, &dirty_thresh);
+		domain_dirty_limits(gdtc);
 
 		if (unlikely(strictlimit)) {
-			bdi_dirty_limits(bdi, dirty_thresh, background_thresh,
-					 &bdi_dirty, &bdi_thresh, &bg_thresh);
+			wb_dirty_limits(gdtc);
 
-			dirty = bdi_dirty;
-			thresh = bdi_thresh;
+			dirty = gdtc->wb_dirty;
+			thresh = gdtc->wb_thresh;
+			bg_thresh = gdtc->wb_bg_thresh;
 		} else {
-			dirty = nr_dirty;
-			thresh = dirty_thresh;
-			bg_thresh = background_thresh;
+			dirty = gdtc->dirty;
+			thresh = gdtc->thresh;
+			bg_thresh = gdtc->bg_thresh;
+		}
+
+		if (mdtc) {
+			unsigned long writeback;
+
+			/*
+			 * If @wb belongs to !root memcg, repeat the same
+			 * basic calculations for the memcg domain.
+			 */
+			mem_cgroup_wb_stats(wb, &mdtc->avail, &mdtc->dirty,
+					    &writeback);
+			mdtc_cap_avail(mdtc);
+			mdtc->dirty += writeback;
+
+			domain_dirty_limits(mdtc);
+
+			if (unlikely(strictlimit)) {
+				wb_dirty_limits(mdtc);
+				m_dirty = mdtc->wb_dirty;
+				m_thresh = mdtc->wb_thresh;
+				m_bg_thresh = mdtc->wb_bg_thresh;
+			} else {
+				m_dirty = mdtc->dirty;
+				m_thresh = mdtc->thresh;
+				m_bg_thresh = mdtc->bg_thresh;
+			}
 		}
 
 		/*
 		 * Throttle it only when the background writeback cannot
 		 * catch-up. This avoids (excessively) small writeouts
-		 * when the bdi limits are ramping up in case of !strictlimit.
+		 * when the wb limits are ramping up in case of !strictlimit.
 		 *
-		 * In strictlimit case make decision based on the bdi counters
-		 * and limits. Small writeouts when the bdi limits are ramping
+		 * In strictlimit case make decision based on the wb counters
+		 * and limits. Small writeouts when the wb limits are ramping
 		 * up are the price we consciously pay for strictlimit-ing.
+		 *
+		 * If memcg domain is in effect, @dirty should be under
+		 * both global and memcg freerun ceilings.
 		 */
-		if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh)) {
+		if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) &&
+		    (!mdtc ||
+		     m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) {
+			unsigned long intv = dirty_poll_interval(dirty, thresh);
+			unsigned long m_intv = ULONG_MAX;
+
 			current->dirty_paused_when = now;
 			current->nr_dirtied = 0;
-			current->nr_dirtied_pause =
-				dirty_poll_interval(dirty, thresh);
+			if (mdtc)
+				m_intv = dirty_poll_interval(m_dirty, m_thresh);
+			current->nr_dirtied_pause = min(intv, m_intv);
 			break;
 		}
 
-		if (unlikely(!writeback_in_progress(bdi)))
-			bdi_start_background_writeback(bdi);
+		if (unlikely(!writeback_in_progress(wb)))
+			wb_start_background_writeback(wb);
 
+		/*
+		 * Calculate global domain's pos_ratio and select the
+		 * global dtc by default.
+		 */
 		if (!strictlimit)
-			bdi_dirty_limits(bdi, dirty_thresh, background_thresh,
-					 &bdi_dirty, &bdi_thresh, NULL);
-
-		dirty_exceeded = (bdi_dirty > bdi_thresh) &&
-				 ((nr_dirty > dirty_thresh) || strictlimit);
-		if (dirty_exceeded && !bdi->dirty_exceeded)
-			bdi->dirty_exceeded = 1;
-
-		bdi_update_bandwidth(bdi, dirty_thresh, background_thresh,
-				     nr_dirty, bdi_thresh, bdi_dirty,
-				     start_time);
-
-		dirty_ratelimit = bdi->dirty_ratelimit;
-		pos_ratio = bdi_position_ratio(bdi, dirty_thresh,
-					       background_thresh, nr_dirty,
-					       bdi_thresh, bdi_dirty);
-		task_ratelimit = ((u64)dirty_ratelimit * pos_ratio) >>
+			wb_dirty_limits(gdtc);
+
+		dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) &&
+			((gdtc->dirty > gdtc->thresh) || strictlimit);
+
+		wb_position_ratio(gdtc);
+		sdtc = gdtc;
+
+		if (mdtc) {
+			/*
+			 * If memcg domain is in effect, calculate its
+			 * pos_ratio.  @wb should satisfy constraints from
+			 * both global and memcg domains.  Choose the one
+			 * w/ lower pos_ratio.
+			 */
+			if (!strictlimit)
+				wb_dirty_limits(mdtc);
+
+			dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) &&
+				((mdtc->dirty > mdtc->thresh) || strictlimit);
+
+			wb_position_ratio(mdtc);
+			if (mdtc->pos_ratio < gdtc->pos_ratio)
+				sdtc = mdtc;
+		}
+
+		if (dirty_exceeded && !wb->dirty_exceeded)
+			wb->dirty_exceeded = 1;
+
+		if (time_is_before_jiffies(wb->bw_time_stamp +
+					   BANDWIDTH_INTERVAL)) {
+			spin_lock(&wb->list_lock);
+			__wb_update_bandwidth(gdtc, mdtc, start_time, true);
+			spin_unlock(&wb->list_lock);
+		}
+
+		/* throttle according to the chosen dtc */
+		dirty_ratelimit = wb->dirty_ratelimit;
+		task_ratelimit = ((u64)dirty_ratelimit * sdtc->pos_ratio) >>
 							RATELIMIT_CALC_SHIFT;
-		max_pause = bdi_max_pause(bdi, bdi_dirty);
-		min_pause = bdi_min_pause(bdi, max_pause,
-					  task_ratelimit, dirty_ratelimit,
-					  &nr_dirtied_pause);
+		max_pause = wb_max_pause(wb, sdtc->wb_dirty);
+		min_pause = wb_min_pause(wb, max_pause,
+					 task_ratelimit, dirty_ratelimit,
+					 &nr_dirtied_pause);
 
 		if (unlikely(task_ratelimit == 0)) {
 			period = max_pause;
@@ -1452,11 +1684,11 @@ static void balance_dirty_pages(struct address_space *mapping,
 		 */
 		if (pause < min_pause) {
 			trace_balance_dirty_pages(bdi,
-						  dirty_thresh,
-						  background_thresh,
-						  nr_dirty,
-						  bdi_thresh,
-						  bdi_dirty,
+						  sdtc->thresh,
+						  sdtc->bg_thresh,
+						  sdtc->dirty,
+						  sdtc->wb_thresh,
+						  sdtc->wb_dirty,
 						  dirty_ratelimit,
 						  task_ratelimit,
 						  pages_dirtied,
@@ -1481,11 +1713,11 @@ static void balance_dirty_pages(struct address_space *mapping,
 
 pause:
 		trace_balance_dirty_pages(bdi,
-					  dirty_thresh,
-					  background_thresh,
-					  nr_dirty,
-					  bdi_thresh,
-					  bdi_dirty,
+					  sdtc->thresh,
+					  sdtc->bg_thresh,
+					  sdtc->dirty,
+					  sdtc->wb_thresh,
+					  sdtc->wb_dirty,
 					  dirty_ratelimit,
 					  task_ratelimit,
 					  pages_dirtied,
@@ -1500,33 +1732,33 @@ pause:
 		current->nr_dirtied_pause = nr_dirtied_pause;
 
 		/*
-		 * This is typically equal to (nr_dirty < dirty_thresh) and can
-		 * also keep "1000+ dd on a slow USB stick" under control.
+		 * This is typically equal to (dirty < thresh) and can also
+		 * keep "1000+ dd on a slow USB stick" under control.
 		 */
 		if (task_ratelimit)
 			break;
 
 		/*
 		 * In the case of an unresponding NFS server and the NFS dirty
-		 * pages exceeds dirty_thresh, give the other good bdi's a pipe
+		 * pages exceeds dirty_thresh, give the other good wb's a pipe
 		 * to go through, so that tasks on them still remain responsive.
 		 *
 		 * In theory 1 page is enough to keep the comsumer-producer
 		 * pipe going: the flusher cleans 1 page => the task dirties 1
-		 * more page. However bdi_dirty has accounting errors.  So use
-		 * the larger and more IO friendly bdi_stat_error.
+		 * more page. However wb_dirty has accounting errors.  So use
+		 * the larger and more IO friendly wb_stat_error.
 		 */
-		if (bdi_dirty <= bdi_stat_error(bdi))
+		if (sdtc->wb_dirty <= wb_stat_error(wb))
 			break;
 
 		if (fatal_signal_pending(current))
 			break;
 	}
 
-	if (!dirty_exceeded && bdi->dirty_exceeded)
-		bdi->dirty_exceeded = 0;
+	if (!dirty_exceeded && wb->dirty_exceeded)
+		wb->dirty_exceeded = 0;
 
-	if (writeback_in_progress(bdi))
+	if (writeback_in_progress(wb))
 		return;
 
 	/*
@@ -1540,8 +1772,8 @@ pause:
 	if (laptop_mode)
 		return;
 
-	if (nr_reclaimable > background_thresh)
-		bdi_start_background_writeback(bdi);
+	if (nr_reclaimable > gdtc->bg_thresh)
+		wb_start_background_writeback(wb);
 }
 
 static DEFINE_PER_CPU(int, bdp_ratelimits);
@@ -1577,15 +1809,22 @@ DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;
  */
 void balance_dirty_pages_ratelimited(struct address_space *mapping)
 {
-	struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
+	struct inode *inode = mapping->host;
+	struct backing_dev_info *bdi = inode_to_bdi(inode);
+	struct bdi_writeback *wb = NULL;
 	int ratelimit;
 	int *p;
 
 	if (!bdi_cap_account_dirty(bdi))
 		return;
 
+	if (inode_cgwb_enabled(inode))
+		wb = wb_get_create_current(bdi, GFP_KERNEL);
+	if (!wb)
+		wb = &bdi->wb;
+
 	ratelimit = current->nr_dirtied_pause;
-	if (bdi->dirty_exceeded)
+	if (wb->dirty_exceeded)
 		ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10));
 
 	preempt_disable();
@@ -1617,10 +1856,59 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping)
 	preempt_enable();
 
 	if (unlikely(current->nr_dirtied >= ratelimit))
-		balance_dirty_pages(mapping, current->nr_dirtied);
+		balance_dirty_pages(mapping, wb, current->nr_dirtied);
+
+	wb_put(wb);
 }
 EXPORT_SYMBOL(balance_dirty_pages_ratelimited);
 
+/**
+ * wb_over_bg_thresh - does @wb need to be written back?
+ * @wb: bdi_writeback of interest
+ *
+ * Determines whether background writeback should keep writing @wb or it's
+ * clean enough.  Returns %true if writeback should continue.
+ */
+bool wb_over_bg_thresh(struct bdi_writeback *wb)
+{
+	struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) };
+	struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) };
+	struct dirty_throttle_control * const gdtc = &gdtc_stor;
+	struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
+						     &mdtc_stor : NULL;
+
+	/*
+	 * Similar to balance_dirty_pages() but ignores pages being written
+	 * as we're trying to decide whether to put more under writeback.
+	 */
+	gdtc->avail = global_dirtyable_memory();
+	gdtc->dirty = global_page_state(NR_FILE_DIRTY) +
+		      global_page_state(NR_UNSTABLE_NFS);
+	domain_dirty_limits(gdtc);
+
+	if (gdtc->dirty > gdtc->bg_thresh)
+		return true;
+
+	if (wb_stat(wb, WB_RECLAIMABLE) > __wb_calc_thresh(gdtc))
+		return true;
+
+	if (mdtc) {
+		unsigned long writeback;
+
+		mem_cgroup_wb_stats(wb, &mdtc->avail, &mdtc->dirty, &writeback);
+		mdtc_cap_avail(mdtc);
+		domain_dirty_limits(mdtc);	/* ditto, ignore writeback */
+
+		if (mdtc->dirty > mdtc->bg_thresh)
+			return true;
+
+		if (wb_stat(wb, WB_RECLAIMABLE) > __wb_calc_thresh(mdtc))
+			return true;
+	}
+
+	return false;
+}
+
 void throttle_vm_writeout(gfp_t gfp_mask)
 {
 	unsigned long background_thresh;
@@ -1628,7 +1916,7 @@ void throttle_vm_writeout(gfp_t gfp_mask)
 
         for ( ; ; ) {
 		global_dirty_limits(&background_thresh, &dirty_thresh);
-		dirty_thresh = hard_dirty_limit(dirty_thresh);
+		dirty_thresh = hard_dirty_limit(&global_wb_domain, dirty_thresh);
 
                 /*
                  * Boost the allowable dirty threshold a bit for page
@@ -1667,14 +1955,20 @@ void laptop_mode_timer_fn(unsigned long data)
 	struct request_queue *q = (struct request_queue *)data;
 	int nr_pages = global_page_state(NR_FILE_DIRTY) +
 		global_page_state(NR_UNSTABLE_NFS);
+	struct bdi_writeback *wb;
+	struct wb_iter iter;
 
 	/*
 	 * We want to write everything out, not just down to the dirty
 	 * threshold
 	 */
-	if (bdi_has_dirty_io(&q->backing_dev_info))
-		bdi_start_writeback(&q->backing_dev_info, nr_pages,
-					WB_REASON_LAPTOP_TIMER);
+	if (!bdi_has_dirty_io(&q->backing_dev_info))
+		return;
+
+	bdi_for_each_wb(wb, &q->backing_dev_info, &iter, 0)
+		if (wb_has_dirty_io(wb))
+			wb_start_writeback(wb, nr_pages, true,
+					   WB_REASON_LAPTOP_TIMER);
 }
 
 /*
@@ -1718,10 +2012,12 @@ void laptop_sync_completion(void)
 
 void writeback_set_ratelimit(void)
 {
+	struct wb_domain *dom = &global_wb_domain;
 	unsigned long background_thresh;
 	unsigned long dirty_thresh;
+
 	global_dirty_limits(&background_thresh, &dirty_thresh);
-	global_dirty_limit = dirty_thresh;
+	dom->dirty_limit = dirty_thresh;
 	ratelimit_pages = dirty_thresh / (num_online_cpus() * 32);
 	if (ratelimit_pages < 16)
 		ratelimit_pages = 16;
@@ -1770,7 +2066,7 @@ void __init page_writeback_init(void)
 	writeback_set_ratelimit();
 	register_cpu_notifier(&ratelimit_nb);
 
-	fprop_global_init(&writeout_completions, GFP_KERNEL);
+	BUG_ON(wb_domain_init(&global_wb_domain, GFP_KERNEL));
 }
 
 /**
@@ -2090,19 +2386,29 @@ int __set_page_dirty_no_writeback(struct page *page)
 
 /*
  * Helper function for set_page_dirty family.
+ *
+ * Caller must hold mem_cgroup_begin_page_stat().
+ *
  * NOTE: This relies on being atomic wrt interrupts.
  */
-void account_page_dirtied(struct page *page, struct address_space *mapping)
+void account_page_dirtied(struct page *page, struct address_space *mapping,
+			  struct mem_cgroup *memcg)
 {
+	struct inode *inode = mapping->host;
+
 	trace_writeback_dirty_page(page, mapping);
 
 	if (mapping_cap_account_dirty(mapping)) {
-		struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
+		struct bdi_writeback *wb;
 
+		inode_attach_wb(inode, page);
+		wb = inode_to_wb(inode);
+
+		mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_DIRTY);
 		__inc_zone_page_state(page, NR_FILE_DIRTY);
 		__inc_zone_page_state(page, NR_DIRTIED);
-		__inc_bdi_stat(bdi, BDI_RECLAIMABLE);
-		__inc_bdi_stat(bdi, BDI_DIRTIED);
+		__inc_wb_stat(wb, WB_RECLAIMABLE);
+		__inc_wb_stat(wb, WB_DIRTIED);
 		task_io_account_write(PAGE_CACHE_SIZE);
 		current->nr_dirtied++;
 		this_cpu_inc(bdp_ratelimits);
@@ -2113,21 +2419,18 @@ EXPORT_SYMBOL(account_page_dirtied);
 /*
  * Helper function for deaccounting dirty page without writeback.
  *
- * Doing this should *normally* only ever be done when a page
- * is truncated, and is not actually mapped anywhere at all. However,
- * fs/buffer.c does this when it notices that somebody has cleaned
- * out all the buffers on a page without actually doing it through
- * the VM. Can you say "ext3 is horribly ugly"? Thought you could.
+ * Caller must hold mem_cgroup_begin_page_stat().
  */
-void account_page_cleaned(struct page *page, struct address_space *mapping)
+void account_page_cleaned(struct page *page, struct address_space *mapping,
+			  struct mem_cgroup *memcg, struct bdi_writeback *wb)
 {
 	if (mapping_cap_account_dirty(mapping)) {
+		mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY);
 		dec_zone_page_state(page, NR_FILE_DIRTY);
-		dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE);
+		dec_wb_stat(wb, WB_RECLAIMABLE);
 		task_io_account_cancelled_write(PAGE_CACHE_SIZE);
 	}
 }
-EXPORT_SYMBOL(account_page_cleaned);
 
 /*
  * For address_spaces which do not use buffers.  Just tag the page as dirty in
@@ -2143,26 +2446,34 @@ EXPORT_SYMBOL(account_page_cleaned);
  */
 int __set_page_dirty_nobuffers(struct page *page)
 {
+	struct mem_cgroup *memcg;
+
+	memcg = mem_cgroup_begin_page_stat(page);
 	if (!TestSetPageDirty(page)) {
 		struct address_space *mapping = page_mapping(page);
 		unsigned long flags;
 
-		if (!mapping)
+		if (!mapping) {
+			mem_cgroup_end_page_stat(memcg);
 			return 1;
+		}
 
 		spin_lock_irqsave(&mapping->tree_lock, flags);
 		BUG_ON(page_mapping(page) != mapping);
 		WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
-		account_page_dirtied(page, mapping);
+		account_page_dirtied(page, mapping, memcg);
 		radix_tree_tag_set(&mapping->page_tree, page_index(page),
 				   PAGECACHE_TAG_DIRTY);
 		spin_unlock_irqrestore(&mapping->tree_lock, flags);
+		mem_cgroup_end_page_stat(memcg);
+
 		if (mapping->host) {
 			/* !PageAnon && !swapper_space */
 			__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 		}
 		return 1;
 	}
+	mem_cgroup_end_page_stat(memcg);
 	return 0;
 }
 EXPORT_SYMBOL(__set_page_dirty_nobuffers);
@@ -2177,10 +2488,17 @@ EXPORT_SYMBOL(__set_page_dirty_nobuffers);
 void account_page_redirty(struct page *page)
 {
 	struct address_space *mapping = page->mapping;
+
 	if (mapping && mapping_cap_account_dirty(mapping)) {
+		struct inode *inode = mapping->host;
+		struct bdi_writeback *wb;
+		bool locked;
+
+		wb = unlocked_inode_to_wb_begin(inode, &locked);
 		current->nr_dirtied--;
 		dec_zone_page_state(page, NR_DIRTIED);
-		dec_bdi_stat(inode_to_bdi(mapping->host), BDI_DIRTIED);
+		dec_wb_stat(wb, WB_DIRTIED);
+		unlocked_inode_to_wb_end(inode, locked);
 	}
 }
 EXPORT_SYMBOL(account_page_redirty);
@@ -2266,6 +2584,43 @@ int set_page_dirty_lock(struct page *page)
 EXPORT_SYMBOL(set_page_dirty_lock);
 
 /*
+ * This cancels just the dirty bit on the kernel page itself, it does NOT
+ * actually remove dirty bits on any mmap's that may be around. It also
+ * leaves the page tagged dirty, so any sync activity will still find it on
+ * the dirty lists, and in particular, clear_page_dirty_for_io() will still
+ * look at the dirty bits in the VM.
+ *
+ * Doing this should *normally* only ever be done when a page is truncated,
+ * and is not actually mapped anywhere at all. However, fs/buffer.c does
+ * this when it notices that somebody has cleaned out all the buffers on a
+ * page without actually doing it through the VM. Can you say "ext3 is
+ * horribly ugly"? Thought you could.
+ */
+void cancel_dirty_page(struct page *page)
+{
+	struct address_space *mapping = page_mapping(page);
+
+	if (mapping_cap_account_dirty(mapping)) {
+		struct inode *inode = mapping->host;
+		struct bdi_writeback *wb;
+		struct mem_cgroup *memcg;
+		bool locked;
+
+		memcg = mem_cgroup_begin_page_stat(page);
+		wb = unlocked_inode_to_wb_begin(inode, &locked);
+
+		if (TestClearPageDirty(page))
+			account_page_cleaned(page, mapping, memcg, wb);
+
+		unlocked_inode_to_wb_end(inode, locked);
+		mem_cgroup_end_page_stat(memcg);
+	} else {
+		ClearPageDirty(page);
+	}
+}
+EXPORT_SYMBOL(cancel_dirty_page);
+
+/*
  * Clear a page's dirty flag, while caring for dirty memory accounting.
  * Returns true if the page was previously dirty.
  *
@@ -2282,10 +2637,16 @@ EXPORT_SYMBOL(set_page_dirty_lock);
 int clear_page_dirty_for_io(struct page *page)
 {
 	struct address_space *mapping = page_mapping(page);
+	int ret = 0;
 
 	BUG_ON(!PageLocked(page));
 
 	if (mapping && mapping_cap_account_dirty(mapping)) {
+		struct inode *inode = mapping->host;
+		struct bdi_writeback *wb;
+		struct mem_cgroup *memcg;
+		bool locked;
+
 		/*
 		 * Yes, Virginia, this is indeed insane.
 		 *
@@ -2321,13 +2682,17 @@ int clear_page_dirty_for_io(struct page *page)
 		 * always locked coming in here, so we get the desired
 		 * exclusion.
 		 */
+		memcg = mem_cgroup_begin_page_stat(page);
+		wb = unlocked_inode_to_wb_begin(inode, &locked);
 		if (TestClearPageDirty(page)) {
+			mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY);
 			dec_zone_page_state(page, NR_FILE_DIRTY);
-			dec_bdi_stat(inode_to_bdi(mapping->host),
-					BDI_RECLAIMABLE);
-			return 1;
+			dec_wb_stat(wb, WB_RECLAIMABLE);
+			ret = 1;
 		}
-		return 0;
+		unlocked_inode_to_wb_end(inode, locked);
+		mem_cgroup_end_page_stat(memcg);
+		return ret;
 	}
 	return TestClearPageDirty(page);
 }
@@ -2341,7 +2706,8 @@ int test_clear_page_writeback(struct page *page)
 
 	memcg = mem_cgroup_begin_page_stat(page);
 	if (mapping) {
-		struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
+		struct inode *inode = mapping->host;
+		struct backing_dev_info *bdi = inode_to_bdi(inode);
 		unsigned long flags;
 
 		spin_lock_irqsave(&mapping->tree_lock, flags);
@@ -2351,8 +2717,10 @@ int test_clear_page_writeback(struct page *page)
 						page_index(page),
 						PAGECACHE_TAG_WRITEBACK);
 			if (bdi_cap_account_writeback(bdi)) {
-				__dec_bdi_stat(bdi, BDI_WRITEBACK);
-				__bdi_writeout_inc(bdi);
+				struct bdi_writeback *wb = inode_to_wb(inode);
+
+				__dec_wb_stat(wb, WB_WRITEBACK);
+				__wb_writeout_inc(wb);
 			}
 		}
 		spin_unlock_irqrestore(&mapping->tree_lock, flags);
@@ -2376,7 +2744,8 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
 
 	memcg = mem_cgroup_begin_page_stat(page);
 	if (mapping) {
-		struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
+		struct inode *inode = mapping->host;
+		struct backing_dev_info *bdi = inode_to_bdi(inode);
 		unsigned long flags;
 
 		spin_lock_irqsave(&mapping->tree_lock, flags);
@@ -2386,7 +2755,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
 						page_index(page),
 						PAGECACHE_TAG_WRITEBACK);
 			if (bdi_cap_account_writeback(bdi))
-				__inc_bdi_stat(bdi, BDI_WRITEBACK);
+				__inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK);
 		}
 		if (!PageDirty(page))
 			radix_tree_tag_clear(&mapping->page_tree,
diff --git a/mm/page_io.c b/mm/page_io.c
index 6424869e275e2a..520baa4b04d75d 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -69,7 +69,7 @@ void end_swap_bio_write(struct bio *bio, int err)
 	bio_put(bio);
 }
 
-void end_swap_bio_read(struct bio *bio, int err)
+static void end_swap_bio_read(struct bio *bio, int err)
 {
 	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	struct page *page = bio->bi_io_vec[0].bv_page;
diff --git a/mm/readahead.c b/mm/readahead.c
index 935675844b2ee4..60cd846a9a4401 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -541,7 +541,7 @@ page_cache_async_readahead(struct address_space *mapping,
 	/*
 	 * Defer asynchronous read-ahead on IO congestion.
 	 */
-	if (bdi_read_congested(inode_to_bdi(mapping->host)))
+	if (inode_read_congested(mapping->host))
 		return;
 
 	/* do read-ahead */
diff --git a/mm/rmap.c b/mm/rmap.c
index 24dd3f9fee27df..8fc556ce2dcb7f 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -30,6 +30,8 @@
  *             swap_lock (in swap_duplicate, swap_info_get)
  *               mmlist_lock (in mmput, drain_mmlist and others)
  *               mapping->private_lock (in __set_page_dirty_buffers)
+ *                 mem_cgroup_{begin,end}_page_stat (memcg->move_lock)
+ *                   mapping->tree_lock (widely used)
  *               inode->i_lock (in set_page_dirty's __mark_inode_dirty)
  *               bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
  *                 sb_lock (within inode_lock in fs/fs-writeback.c)
diff --git a/mm/truncate.c b/mm/truncate.c
index 66af9031fae807..76e35ad971025c 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -116,9 +116,7 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
 	 * the VM has canceled the dirty bit (eg ext3 journaling).
 	 * Hence dirty accounting check is placed after invalidation.
 	 */
-	if (TestClearPageDirty(page))
-		account_page_cleaned(page, mapping);
-
+	cancel_dirty_page(page);
 	ClearPageMappedToDisk(page);
 	delete_from_page_cache(page);
 	return 0;
@@ -512,19 +510,24 @@ EXPORT_SYMBOL(invalidate_mapping_pages);
 static int
 invalidate_complete_page2(struct address_space *mapping, struct page *page)
 {
+	struct mem_cgroup *memcg;
+	unsigned long flags;
+
 	if (page->mapping != mapping)
 		return 0;
 
 	if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL))
 		return 0;
 
-	spin_lock_irq(&mapping->tree_lock);
+	memcg = mem_cgroup_begin_page_stat(page);
+	spin_lock_irqsave(&mapping->tree_lock, flags);
 	if (PageDirty(page))
 		goto failed;
 
 	BUG_ON(page_has_private(page));
-	__delete_from_page_cache(page, NULL);
-	spin_unlock_irq(&mapping->tree_lock);
+	__delete_from_page_cache(page, NULL, memcg);
+	spin_unlock_irqrestore(&mapping->tree_lock, flags);
+	mem_cgroup_end_page_stat(memcg);
 
 	if (mapping->a_ops->freepage)
 		mapping->a_ops->freepage(page);
@@ -532,7 +535,8 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
 	page_cache_release(page);	/* pagecache ref */
 	return 1;
 failed:
-	spin_unlock_irq(&mapping->tree_lock);
+	spin_unlock_irqrestore(&mapping->tree_lock, flags);
+	mem_cgroup_end_page_stat(memcg);
 	return 0;
 }
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5e8eadd71bac71..8cb16ebaf3ed08 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -154,11 +154,42 @@ static bool global_reclaim(struct scan_control *sc)
 {
 	return !sc->target_mem_cgroup;
 }
+
+/**
+ * sane_reclaim - is the usual dirty throttling mechanism operational?
+ * @sc: scan_control in question
+ *
+ * The normal page dirty throttling mechanism in balance_dirty_pages() is
+ * completely broken with the legacy memcg and direct stalling in
+ * shrink_page_list() is used for throttling instead, which lacks all the
+ * niceties such as fairness, adaptive pausing, bandwidth proportional
+ * allocation and configurability.
+ *
+ * This function tests whether the vmscan currently in progress can assume
+ * that the normal dirty throttling mechanism is operational.
+ */
+static bool sane_reclaim(struct scan_control *sc)
+{
+	struct mem_cgroup *memcg = sc->target_mem_cgroup;
+
+	if (!memcg)
+		return true;
+#ifdef CONFIG_CGROUP_WRITEBACK
+	if (cgroup_on_dfl(mem_cgroup_css(memcg)->cgroup))
+		return true;
+#endif
+	return false;
+}
 #else
 static bool global_reclaim(struct scan_control *sc)
 {
 	return true;
 }
+
+static bool sane_reclaim(struct scan_control *sc)
+{
+	return true;
+}
 #endif
 
 static unsigned long zone_reclaimable_pages(struct zone *zone)
@@ -452,14 +483,13 @@ static inline int is_page_cache_freeable(struct page *page)
 	return page_count(page) - page_has_private(page) == 2;
 }
 
-static int may_write_to_queue(struct backing_dev_info *bdi,
-			      struct scan_control *sc)
+static int may_write_to_inode(struct inode *inode, struct scan_control *sc)
 {
 	if (current->flags & PF_SWAPWRITE)
 		return 1;
-	if (!bdi_write_congested(bdi))
+	if (!inode_write_congested(inode))
 		return 1;
-	if (bdi == current->backing_dev_info)
+	if (inode_to_bdi(inode) == current->backing_dev_info)
 		return 1;
 	return 0;
 }
@@ -538,7 +568,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
 	}
 	if (mapping->a_ops->writepage == NULL)
 		return PAGE_ACTIVATE;
-	if (!may_write_to_queue(inode_to_bdi(mapping->host), sc))
+	if (!may_write_to_inode(mapping->host, sc))
 		return PAGE_KEEP;
 
 	if (clear_page_dirty_for_io(page)) {
@@ -579,10 +609,14 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
 static int __remove_mapping(struct address_space *mapping, struct page *page,
 			    bool reclaimed)
 {
+	unsigned long flags;
+	struct mem_cgroup *memcg;
+
 	BUG_ON(!PageLocked(page));
 	BUG_ON(mapping != page_mapping(page));
 
-	spin_lock_irq(&mapping->tree_lock);
+	memcg = mem_cgroup_begin_page_stat(page);
+	spin_lock_irqsave(&mapping->tree_lock, flags);
 	/*
 	 * The non racy check for a busy page.
 	 *
@@ -620,7 +654,8 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
 		swp_entry_t swap = { .val = page_private(page) };
 		mem_cgroup_swapout(page, swap);
 		__delete_from_swap_cache(page);
-		spin_unlock_irq(&mapping->tree_lock);
+		spin_unlock_irqrestore(&mapping->tree_lock, flags);
+		mem_cgroup_end_page_stat(memcg);
 		swapcache_free(swap);
 	} else {
 		void (*freepage)(struct page *);
@@ -640,8 +675,9 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
 		if (reclaimed && page_is_file_cache(page) &&
 		    !mapping_exiting(mapping))
 			shadow = workingset_eviction(mapping, page);
-		__delete_from_page_cache(page, shadow);
-		spin_unlock_irq(&mapping->tree_lock);
+		__delete_from_page_cache(page, shadow, memcg);
+		spin_unlock_irqrestore(&mapping->tree_lock, flags);
+		mem_cgroup_end_page_stat(memcg);
 
 		if (freepage != NULL)
 			freepage(page);
@@ -650,7 +686,8 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
 	return 1;
 
 cannot_free:
-	spin_unlock_irq(&mapping->tree_lock);
+	spin_unlock_irqrestore(&mapping->tree_lock, flags);
+	mem_cgroup_end_page_stat(memcg);
 	return 0;
 }
 
@@ -917,7 +954,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		 */
 		mapping = page_mapping(page);
 		if (((dirty || writeback) && mapping &&
-		     bdi_write_congested(inode_to_bdi(mapping->host))) ||
+		     inode_write_congested(mapping->host)) ||
 		    (writeback && PageReclaim(page)))
 			nr_congested++;
 
@@ -935,10 +972,10 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		 *    note that the LRU is being scanned too quickly and the
 		 *    caller can stall after page list has been processed.
 		 *
-		 * 2) Global reclaim encounters a page, memcg encounters a
-		 *    page that is not marked for immediate reclaim or
-		 *    the caller does not have __GFP_IO. In this case mark
-		 *    the page for immediate reclaim and continue scanning.
+		 * 2) Global or new memcg reclaim encounters a page that is
+		 *    not marked for immediate reclaim or the caller does not
+		 *    have __GFP_IO. In this case mark the page for immediate
+		 *    reclaim and continue scanning.
 		 *
 		 *    __GFP_IO is checked  because a loop driver thread might
 		 *    enter reclaim, and deadlock if it waits on a page for
@@ -952,7 +989,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		 *    grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing
 		 *    may_enter_fs here is liable to OOM on them.
 		 *
-		 * 3) memcg encounters a page that is not already marked
+		 * 3) Legacy memcg encounters a page that is not already marked
 		 *    PageReclaim. memcg does not have any dirty pages
 		 *    throttling so we could easily OOM just because too many
 		 *    pages are in writeback and there is nothing else to
@@ -967,7 +1004,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 				goto keep_locked;
 
 			/* Case 2 above */
-			} else if (global_reclaim(sc) ||
+			} else if (sane_reclaim(sc) ||
 			    !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
 				/*
 				 * This is slightly racy - end_page_writeback()
@@ -1416,7 +1453,7 @@ static int too_many_isolated(struct zone *zone, int file,
 	if (current_is_kswapd())
 		return 0;
 
-	if (!global_reclaim(sc))
+	if (!sane_reclaim(sc))
 		return 0;
 
 	if (file) {
@@ -1608,10 +1645,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 		set_bit(ZONE_WRITEBACK, &zone->flags);
 
 	/*
-	 * memcg will stall in page writeback so only consider forcibly
-	 * stalling for global reclaim
+	 * Legacy memcg will stall in page writeback so avoid forcibly
+	 * stalling here.
 	 */
-	if (global_reclaim(sc)) {
+	if (sane_reclaim(sc)) {
 		/*
 		 * Tag a zone as congested if all the dirty pages scanned were
 		 * backed by a congested BDI and wait_iff_congested will stall.
author	Michael Ellerman <mpe@ellerman.id.au>	2015-06-05 16:08:29 +1000
committer	Michael Ellerman <mpe@ellerman.id.au>	2015-06-05 16:08:29 +1000
commit	a71e49de186dc0b08f5218b783cbe64ded9cd983 (patch)
tree	b53e89ab4d39937bd36b022ee8d1b2e37e2179af
parent	5cd8ae4c0ec4193aece58d0460eab2a86b0e2187 (diff)
parent	613a2e5fb8db8dae722ee80dabc216bb65cb1a86 (diff)
download	linux-next-a71e49de186dc0b08f5218b783cbe64ded9cd983.tar.gz