This patch implements the TCP transport driver for the NVMe over Fabrics target stack. This allows exporting NVMe over Fabrics functionality over good old TCP/IP. The driver implements the TP 8000 of how nvme over fabrics capsules and data are encapsulated in nvme-tcp pdus and exchaged on top of a TCP byte stream. nvme-tcp header and data digest are supported as well. Signed-off-by: Sagi Grimberg <sagi@lightbitslabs.com> Signed-off-by: Roy Shterman <roys@lightbitslabs.com> Signed-off-by: Solganik Alexander <sashas@lightbitslabs.com> Signed-off-by: Christoph Hellwig <hch@lst.de>

3 files changed, 1749 insertions, 0 deletions
diff --git a/drivers/nvme/target/Kconfig b/drivers/nvme/target/Kconfig
index 3c7b61ddb0d186..d94f25cde0192f 100644
--- a/drivers/nvme/target/Kconfig
+++ b/drivers/nvme/target/Kconfig
@@ -60,3 +60,13 @@ config NVME_TARGET_FCLOOP
 	  to test NVMe-FC transport interfaces.
 
 	  If unsure, say N.
+
+config NVME_TARGET_TCP
+	tristate "NVMe over Fabrics TCP target support"
+	depends on INET
+	depends on NVME_TARGET
+	help
+	  This enables the NVMe TCP target support, which allows exporting NVMe
+	  devices over TCP.
+
+	  If unsure, say N.
diff --git a/drivers/nvme/target/Makefile b/drivers/nvme/target/Makefile
index 8118c93391c6f8..8c3ad0fb68605d 100644
--- a/drivers/nvme/target/Makefile
+++ b/drivers/nvme/target/Makefile
@@ -5,6 +5,7 @@ obj-$(CONFIG_NVME_TARGET_LOOP)		+= nvme-loop.o
 obj-$(CONFIG_NVME_TARGET_RDMA)		+= nvmet-rdma.o
 obj-$(CONFIG_NVME_TARGET_FC)		+= nvmet-fc.o
 obj-$(CONFIG_NVME_TARGET_FCLOOP)	+= nvme-fcloop.o
+obj-$(CONFIG_NVME_TARGET_TCP)		+= nvmet-tcp.o
 
 nvmet-y		+= core.o configfs.o admin-cmd.o fabrics-cmd.o \
 			discovery.o io-cmd-file.o io-cmd-bdev.o
@@ -12,3 +13,4 @@ nvme-loop-y	+= loop.o
 nvmet-rdma-y	+= rdma.o
 nvmet-fc-y	+= fc.o
 nvme-fcloop-y	+= fcloop.o
+nvmet-tcp-y	+= tcp.o
diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
new file mode 100644
index 00000000000000..d31bec26016099
--- /dev/null
+++ b/drivers/nvme/target/tcp.c
@@ -0,0 +1,1737 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NVMe over Fabrics TCP target.
+ * Copyright (c) 2018 Lightbits Labs. All rights reserved.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <linux/nvme-tcp.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <linux/inet.h>
+#include <linux/llist.h>
+#include <crypto/hash.h>
+
+#include "nvmet.h"
+
+#define NVMET_TCP_DEF_INLINE_DATA_SIZE	(4 * PAGE_SIZE)
+
+#define NVMET_TCP_RECV_BUDGET		8
+#define NVMET_TCP_SEND_BUDGET		8
+#define NVMET_TCP_IO_WORK_BUDGET	64
+
+enum nvmet_tcp_send_state {
+	NVMET_TCP_SEND_DATA_PDU,
+	NVMET_TCP_SEND_DATA,
+	NVMET_TCP_SEND_R2T,
+	NVMET_TCP_SEND_DDGST,
+	NVMET_TCP_SEND_RESPONSE
+};
+
+enum nvmet_tcp_recv_state {
+	NVMET_TCP_RECV_PDU,
+	NVMET_TCP_RECV_DATA,
+	NVMET_TCP_RECV_DDGST,
+	NVMET_TCP_RECV_ERR,
+};
+
+enum {
+	NVMET_TCP_F_INIT_FAILED = (1 << 0),
+};
+
+struct nvmet_tcp_cmd {
+	struct nvmet_tcp_queue		*queue;
+	struct nvmet_req		req;
+
+	struct nvme_tcp_cmd_pdu		*cmd_pdu;
+	struct nvme_tcp_rsp_pdu		*rsp_pdu;
+	struct nvme_tcp_data_pdu	*data_pdu;
+	struct nvme_tcp_r2t_pdu		*r2t_pdu;
+
+	u32				rbytes_done;
+	u32				wbytes_done;
+
+	u32				pdu_len;
+	u32				pdu_recv;
+	int				sg_idx;
+	int				nr_mapped;
+	struct msghdr			recv_msg;
+	struct kvec			*iov;
+	u32				flags;
+
+	struct list_head		entry;
+	struct llist_node		lentry;
+
+	/* send state */
+	u32				offset;
+	struct scatterlist		*cur_sg;
+	enum nvmet_tcp_send_state	state;
+
+	__le32				exp_ddgst;
+	__le32				recv_ddgst;
+};
+
+enum nvmet_tcp_queue_state {
+	NVMET_TCP_Q_CONNECTING,
+	NVMET_TCP_Q_LIVE,
+	NVMET_TCP_Q_DISCONNECTING,
+};
+
+struct nvmet_tcp_queue {
+	struct socket		*sock;
+	struct nvmet_tcp_port	*port;
+	struct work_struct	io_work;
+	int			cpu;
+	struct nvmet_cq		nvme_cq;
+	struct nvmet_sq		nvme_sq;
+
+	/* send state */
+	struct nvmet_tcp_cmd	*cmds;
+	unsigned int		nr_cmds;
+	struct list_head	free_list;
+	struct llist_head	resp_list;
+	struct list_head	resp_send_list;
+	int			send_list_len;
+	struct nvmet_tcp_cmd	*snd_cmd;
+
+	/* recv state */
+	int			offset;
+	int			left;
+	enum nvmet_tcp_recv_state rcv_state;
+	struct nvmet_tcp_cmd	*cmd;
+	union nvme_tcp_pdu	pdu;
+
+	/* digest state */
+	bool			hdr_digest;
+	bool			data_digest;
+	struct ahash_request	*snd_hash;
+	struct ahash_request	*rcv_hash;
+
+	spinlock_t		state_lock;
+	enum nvmet_tcp_queue_state state;
+
+	struct sockaddr_storage	sockaddr;
+	struct sockaddr_storage	sockaddr_peer;
+	struct work_struct	release_work;
+
+	int			idx;
+	struct list_head	queue_list;
+
+	struct nvmet_tcp_cmd	connect;
+
+	struct page_frag_cache	pf_cache;
+
+	void (*data_ready)(struct sock *);
+	void (*state_change)(struct sock *);
+	void (*write_space)(struct sock *);
+};
+
+struct nvmet_tcp_port {
+	struct socket		*sock;
+	struct work_struct	accept_work;
+	struct nvmet_port	*nport;
+	struct sockaddr_storage addr;
+	int			last_cpu;
+	void (*data_ready)(struct sock *);
+};
+
+static DEFINE_IDA(nvmet_tcp_queue_ida);
+static LIST_HEAD(nvmet_tcp_queue_list);
+static DEFINE_MUTEX(nvmet_tcp_queue_mutex);
+
+static struct workqueue_struct *nvmet_tcp_wq;
+static struct nvmet_fabrics_ops nvmet_tcp_ops;
+static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c);
+static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd);
+
+static inline u16 nvmet_tcp_cmd_tag(struct nvmet_tcp_queue *queue,
+		struct nvmet_tcp_cmd *cmd)
+{
+	return cmd - queue->cmds;
+}
+
+static inline bool nvmet_tcp_has_data_in(struct nvmet_tcp_cmd *cmd)
+{
+	return nvme_is_write(cmd->req.cmd) &&
+		cmd->rbytes_done < cmd->req.transfer_len;
+}
+
+static inline bool nvmet_tcp_need_data_in(struct nvmet_tcp_cmd *cmd)
+{
+	return nvmet_tcp_has_data_in(cmd) && !cmd->req.rsp->status;
+}
+
+static inline bool nvmet_tcp_need_data_out(struct nvmet_tcp_cmd *cmd)
+{
+	return !nvme_is_write(cmd->req.cmd) &&
+		cmd->req.transfer_len > 0 &&
+		!cmd->req.rsp->status;
+}
+
+static inline bool nvmet_tcp_has_inline_data(struct nvmet_tcp_cmd *cmd)
+{
+	return nvme_is_write(cmd->req.cmd) && cmd->pdu_len &&
+		!cmd->rbytes_done;
+}
+
+static inline struct nvmet_tcp_cmd *
+nvmet_tcp_get_cmd(struct nvmet_tcp_queue *queue)
+{
+	struct nvmet_tcp_cmd *cmd;
+
+	cmd = list_first_entry_or_null(&queue->free_list,
+				struct nvmet_tcp_cmd, entry);
+	if (!cmd)
+		return NULL;
+	list_del_init(&cmd->entry);
+
+	cmd->rbytes_done = cmd->wbytes_done = 0;
+	cmd->pdu_len = 0;
+	cmd->pdu_recv = 0;
+	cmd->iov = NULL;
+	cmd->flags = 0;
+	return cmd;
+}
+
+static inline void nvmet_tcp_put_cmd(struct nvmet_tcp_cmd *cmd)
+{
+	if (unlikely(cmd == &cmd->queue->connect))
+		return;
+
+	list_add_tail(&cmd->entry, &cmd->queue->free_list);
+}
+
+static inline u8 nvmet_tcp_hdgst_len(struct nvmet_tcp_queue *queue)
+{
+	return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0;
+}
+
+static inline u8 nvmet_tcp_ddgst_len(struct nvmet_tcp_queue *queue)
+{
+	return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0;
+}
+
+static inline void nvmet_tcp_hdgst(struct ahash_request *hash,
+		void *pdu, size_t len)
+{
+	struct scatterlist sg;
+
+	sg_init_one(&sg, pdu, len);
+	ahash_request_set_crypt(hash, &sg, pdu + len, len);
+	crypto_ahash_digest(hash);
+}
+
+static int nvmet_tcp_verify_hdgst(struct nvmet_tcp_queue *queue,
+	void *pdu, size_t len)
+{
+	struct nvme_tcp_hdr *hdr = pdu;
+	__le32 recv_digest;
+	__le32 exp_digest;
+
+	if (unlikely(!(hdr->flags & NVME_TCP_F_HDGST))) {
+		pr_err("queue %d: header digest enabled but no header digest\n",
+			queue->idx);
+		return -EPROTO;
+	}
+
+	recv_digest = *(__le32 *)(pdu + hdr->hlen);
+	nvmet_tcp_hdgst(queue->rcv_hash, pdu, len);
+	exp_digest = *(__le32 *)(pdu + hdr->hlen);
+	if (recv_digest != exp_digest) {
+		pr_err("queue %d: header digest error: recv %#x expected %#x\n",
+			queue->idx, le32_to_cpu(recv_digest),
+			le32_to_cpu(exp_digest));
+		return -EPROTO;
+	}
+
+	return 0;
+}
+
+static int nvmet_tcp_check_ddgst(struct nvmet_tcp_queue *queue, void *pdu)
+{
+	struct nvme_tcp_hdr *hdr = pdu;
+	u8 digest_len = nvmet_tcp_hdgst_len(queue);
+	u32 len;
+
+	len = le32_to_cpu(hdr->plen) - hdr->hlen -
+		(hdr->flags & NVME_TCP_F_HDGST ? digest_len : 0);
+
+	if (unlikely(len && !(hdr->flags & NVME_TCP_F_DDGST))) {
+		pr_err("queue %d: data digest flag is cleared\n", queue->idx);
+		return -EPROTO;
+	}
+
+	return 0;
+}
+
+static void nvmet_tcp_unmap_pdu_iovec(struct nvmet_tcp_cmd *cmd)
+{
+	struct scatterlist *sg;
+	int i;
+
+	sg = &cmd->req.sg[cmd->sg_idx];
+
+	for (i = 0; i < cmd->nr_mapped; i++)
+		kunmap(sg_page(&sg[i]));
+}
+
+static void nvmet_tcp_map_pdu_iovec(struct nvmet_tcp_cmd *cmd)
+{
+	struct kvec *iov = cmd->iov;
+	struct scatterlist *sg;
+	u32 length, offset, sg_offset;
+
+	length = cmd->pdu_len;
+	cmd->nr_mapped = DIV_ROUND_UP(length, PAGE_SIZE);
+	offset = cmd->rbytes_done;
+	cmd->sg_idx = DIV_ROUND_UP(offset, PAGE_SIZE);
+	sg_offset = offset % PAGE_SIZE;
+	sg = &cmd->req.sg[cmd->sg_idx];
+
+	while (length) {
+		u32 iov_len = min_t(u32, length, sg->length - sg_offset);
+
+		iov->iov_base = kmap(sg_page(sg)) + sg->offset + sg_offset;
+		iov->iov_len = iov_len;
+
+		length -= iov_len;
+		sg = sg_next(sg);
+		iov++;
+	}
+
+	iov_iter_kvec(&cmd->recv_msg.msg_iter, READ, cmd->iov,
+		cmd->nr_mapped, cmd->pdu_len);
+}
+
+static void nvmet_tcp_fatal_error(struct nvmet_tcp_queue *queue)
+{
+	queue->rcv_state = NVMET_TCP_RECV_ERR;
+	if (queue->nvme_sq.ctrl)
+		nvmet_ctrl_fatal_error(queue->nvme_sq.ctrl);
+	else
+		kernel_sock_shutdown(queue->sock, SHUT_RDWR);
+}
+
+static int nvmet_tcp_map_data(struct nvmet_tcp_cmd *cmd)
+{
+	struct nvme_sgl_desc *sgl = &cmd->req.cmd->common.dptr.sgl;
+	u32 len = le32_to_cpu(sgl->length);
+
+	if (!cmd->req.data_len)
+		return 0;
+
+	if (sgl->type == ((NVME_SGL_FMT_DATA_DESC << 4) |
+			  NVME_SGL_FMT_OFFSET)) {
+		if (!nvme_is_write(cmd->req.cmd))
+			return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
+
+		if (len > cmd->req.port->inline_data_size)
+			return NVME_SC_SGL_INVALID_OFFSET | NVME_SC_DNR;
+		cmd->pdu_len = len;
+	}
+	cmd->req.transfer_len += len;
+
+	cmd->req.sg = sgl_alloc(len, GFP_KERNEL, &cmd->req.sg_cnt);
+	if (!cmd->req.sg)
+		return NVME_SC_INTERNAL;
+	cmd->cur_sg = cmd->req.sg;
+
+	if (nvmet_tcp_has_data_in(cmd)) {
+		cmd->iov = kmalloc_array(cmd->req.sg_cnt,
+				sizeof(*cmd->iov), GFP_KERNEL);
+		if (!cmd->iov)
+			goto err;
+	}
+
+	return 0;
+err:
+	sgl_free(cmd->req.sg);
+	return NVME_SC_INTERNAL;
+}
+
+static void nvmet_tcp_ddgst(struct ahash_request *hash,
+		struct nvmet_tcp_cmd *cmd)
+{
+	ahash_request_set_crypt(hash, cmd->req.sg,
+		(void *)&cmd->exp_ddgst, cmd->req.transfer_len);
+	crypto_ahash_digest(hash);
+}
+
+static void nvmet_setup_c2h_data_pdu(struct nvmet_tcp_cmd *cmd)
+{
+	struct nvme_tcp_data_pdu *pdu = cmd->data_pdu;
+	struct nvmet_tcp_queue *queue = cmd->queue;
+	u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
+	u8 ddgst = nvmet_tcp_ddgst_len(cmd->queue);
+
+	cmd->offset = 0;
+	cmd->state = NVMET_TCP_SEND_DATA_PDU;
+
+	pdu->hdr.type = nvme_tcp_c2h_data;
+	pdu->hdr.flags = NVME_TCP_F_DATA_LAST;
+	pdu->hdr.hlen = sizeof(*pdu);
+	pdu->hdr.pdo = pdu->hdr.hlen + hdgst;
+	pdu->hdr.plen =
+		cpu_to_le32(pdu->hdr.hlen + hdgst +
+				cmd->req.transfer_len + ddgst);
+	pdu->command_id = cmd->req.rsp->command_id;
+	pdu->data_length = cpu_to_le32(cmd->req.transfer_len);
+	pdu->data_offset = cpu_to_le32(cmd->wbytes_done);
+
+	if (queue->data_digest) {
+		pdu->hdr.flags |= NVME_TCP_F_DDGST;
+		nvmet_tcp_ddgst(queue->snd_hash, cmd);
+	}
+
+	if (cmd->queue->hdr_digest) {
+		pdu->hdr.flags |= NVME_TCP_F_HDGST;
+		nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
+	}
+}
+
+static void nvmet_setup_r2t_pdu(struct nvmet_tcp_cmd *cmd)
+{
+	struct nvme_tcp_r2t_pdu *pdu = cmd->r2t_pdu;
+	struct nvmet_tcp_queue *queue = cmd->queue;
+	u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
+
+	cmd->offset = 0;
+	cmd->state = NVMET_TCP_SEND_R2T;
+
+	pdu->hdr.type = nvme_tcp_r2t;
+	pdu->hdr.flags = 0;
+	pdu->hdr.hlen = sizeof(*pdu);
+	pdu->hdr.pdo = 0;
+	pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);
+
+	pdu->command_id = cmd->req.cmd->common.command_id;
+	pdu->ttag = nvmet_tcp_cmd_tag(cmd->queue, cmd);
+	pdu->r2t_length = cpu_to_le32(cmd->req.transfer_len - cmd->rbytes_done);
+	pdu->r2t_offset = cpu_to_le32(cmd->rbytes_done);
+	if (cmd->queue->hdr_digest) {
+		pdu->hdr.flags |= NVME_TCP_F_HDGST;
+		nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
+	}
+}
+
+static void nvmet_setup_response_pdu(struct nvmet_tcp_cmd *cmd)
+{
+	struct nvme_tcp_rsp_pdu *pdu = cmd->rsp_pdu;
+	struct nvmet_tcp_queue *queue = cmd->queue;
+	u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
+
+	cmd->offset = 0;
+	cmd->state = NVMET_TCP_SEND_RESPONSE;
+
+	pdu->hdr.type = nvme_tcp_rsp;
+	pdu->hdr.flags = 0;
+	pdu->hdr.hlen = sizeof(*pdu);
+	pdu->hdr.pdo = 0;
+	pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);
+	if (cmd->queue->hdr_digest) {
+		pdu->hdr.flags |= NVME_TCP_F_HDGST;
+		nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
+	}
+}
+
+static void nvmet_tcp_process_resp_list(struct nvmet_tcp_queue *queue)
+{
+	struct llist_node *node;
+
+	node = llist_del_all(&queue->resp_list);
+	if (!node)
+		return;
+
+	while (node) {
+		struct nvmet_tcp_cmd *cmd = llist_entry(node,
+					struct nvmet_tcp_cmd, lentry);
+
+		list_add(&cmd->entry, &queue->resp_send_list);
+		node = node->next;
+		queue->send_list_len++;
+	}
+}
+
+static struct nvmet_tcp_cmd *nvmet_tcp_fetch_cmd(struct nvmet_tcp_queue *queue)
+{
+	queue->snd_cmd = list_first_entry_or_null(&queue->resp_send_list,
+				struct nvmet_tcp_cmd, entry);
+	if (!queue->snd_cmd) {
+		nvmet_tcp_process_resp_list(queue);
+		queue->snd_cmd =
+			list_first_entry_or_null(&queue->resp_send_list,
+					struct nvmet_tcp_cmd, entry);
+		if (unlikely(!queue->snd_cmd))
+			return NULL;
+	}
+
+	list_del_init(&queue->snd_cmd->entry);
+	queue->send_list_len--;
+
+	if (nvmet_tcp_need_data_out(queue->snd_cmd))
+		nvmet_setup_c2h_data_pdu(queue->snd_cmd);
+	else if (nvmet_tcp_need_data_in(queue->snd_cmd))
+		nvmet_setup_r2t_pdu(queue->snd_cmd);
+	else
+		nvmet_setup_response_pdu(queue->snd_cmd);
+
+	return queue->snd_cmd;
+}
+
+static void nvmet_tcp_queue_response(struct nvmet_req *req)
+{
+	struct nvmet_tcp_cmd *cmd =
+		container_of(req, struct nvmet_tcp_cmd, req);
+	struct nvmet_tcp_queue	*queue = cmd->queue;
+
+	llist_add(&cmd->lentry, &queue->resp_list);
+	queue_work_on(cmd->queue->cpu, nvmet_tcp_wq, &cmd->queue->io_work);
+}
+
+static int nvmet_try_send_data_pdu(struct nvmet_tcp_cmd *cmd)
+{
+	u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
+	int left = sizeof(*cmd->data_pdu) - cmd->offset + hdgst;
+	int ret;
+
+	ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->data_pdu),
+			offset_in_page(cmd->data_pdu) + cmd->offset,
+			left, MSG_DONTWAIT | MSG_MORE);
+	if (ret <= 0)
+		return ret;
+
+	cmd->offset += ret;
+	left -= ret;
+
+	if (left)
+		return -EAGAIN;
+
+	cmd->state = NVMET_TCP_SEND_DATA;
+	cmd->offset  = 0;
+	return 1;
+}
+
+static int nvmet_try_send_data(struct nvmet_tcp_cmd *cmd)
+{
+	struct nvmet_tcp_queue *queue = cmd->queue;
+	int ret;
+
+	while (cmd->cur_sg) {
+		struct page *page = sg_page(cmd->cur_sg);
+		u32 left = cmd->cur_sg->length - cmd->offset;
+
+		ret = kernel_sendpage(cmd->queue->sock, page, cmd->offset,
+					left, MSG_DONTWAIT | MSG_MORE);
+		if (ret <= 0)
+			return ret;
+
+		cmd->offset += ret;
+		cmd->wbytes_done += ret;
+
+		/* Done with sg?*/
+		if (cmd->offset == cmd->cur_sg->length) {
+			cmd->cur_sg = sg_next(cmd->cur_sg);
+			cmd->offset = 0;
+		}
+	}
+
+	if (queue->data_digest) {
+		cmd->state = NVMET_TCP_SEND_DDGST;
+		cmd->offset = 0;
+	} else {
+		nvmet_setup_response_pdu(cmd);
+	}
+	return 1;
+
+}
+
+static int nvmet_try_send_response(struct nvmet_tcp_cmd *cmd,
+		bool last_in_batch)
+{
+	u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
+	int left = sizeof(*cmd->rsp_pdu) - cmd->offset + hdgst;
+	int flags = MSG_DONTWAIT;
+	int ret;
+
+	if (!last_in_batch && cmd->queue->send_list_len)
+		flags |= MSG_MORE;
+	else
+		flags |= MSG_EOR;
+
+	ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->rsp_pdu),
+		offset_in_page(cmd->rsp_pdu) + cmd->offset, left, flags);
+	if (ret <= 0)
+		return ret;
+	cmd->offset += ret;
+	left -= ret;
+
+	if (left)
+		return -EAGAIN;
+
+	kfree(cmd->iov);
+	sgl_free(cmd->req.sg);
+	cmd->queue->snd_cmd = NULL;
+	nvmet_tcp_put_cmd(cmd);
+	return 1;
+}
+
+static int nvmet_try_send_r2t(struct nvmet_tcp_cmd *cmd, bool last_in_batch)
+{
+	u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
+	int left = sizeof(*cmd->r2t_pdu) - cmd->offset + hdgst;
+	int flags = MSG_DONTWAIT;
+	int ret;
+
+	if (!last_in_batch && cmd->queue->send_list_len)
+		flags |= MSG_MORE;
+	else
+		flags |= MSG_EOR;
+
+	ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->r2t_pdu),
+		offset_in_page(cmd->r2t_pdu) + cmd->offset, left, flags);
+	if (ret <= 0)
+		return ret;
+	cmd->offset += ret;
+	left -= ret;
+
+	if (left)
+		return -EAGAIN;
+
+	cmd->queue->snd_cmd = NULL;
+	return 1;
+}
+
+static int nvmet_try_send_ddgst(struct nvmet_tcp_cmd *cmd)
+{
+	struct nvmet_tcp_queue *queue = cmd->queue;
+	struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
+	struct kvec iov = {
+		.iov_base = &cmd->exp_ddgst + cmd->offset,
+		.iov_len = NVME_TCP_DIGEST_LENGTH - cmd->offset
+	};
+	int ret;
+
+	ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
+	if (unlikely(ret <= 0))
+		return ret;
+
+	cmd->offset += ret;
+	nvmet_setup_response_pdu(cmd);
+	return 1;
+}
+
+static int nvmet_tcp_try_send_one(struct nvmet_tcp_queue *queue,
+		bool last_in_batch)
+{
+	struct nvmet_tcp_cmd *cmd = queue->snd_cmd;
+	int ret = 0;
+
+	if (!cmd || queue->state == NVMET_TCP_Q_DISCONNECTING) {
+		cmd = nvmet_tcp_fetch_cmd(queue);
+		if (unlikely(!cmd))
+			return 0;
+	}
+
+	if (cmd->state == NVMET_TCP_SEND_DATA_PDU) {
+		ret = nvmet_try_send_data_pdu(cmd);
+		if (ret <= 0)
+			goto done_send;
+	}
+
+	if (cmd->state == NVMET_TCP_SEND_DATA) {
+		ret = nvmet_try_send_data(cmd);
+		if (ret <= 0)
+			goto done_send;
+	}
+
+	if (cmd->state == NVMET_TCP_SEND_DDGST) {
+		ret = nvmet_try_send_ddgst(cmd);
+		if (ret <= 0)
+			goto done_send;
+	}
+
+	if (cmd->state == NVMET_TCP_SEND_R2T) {
+		ret = nvmet_try_send_r2t(cmd, last_in_batch);
+		if (ret <= 0)
+			goto done_send;
+	}
+
+	if (cmd->state == NVMET_TCP_SEND_RESPONSE)
+		ret = nvmet_try_send_response(cmd, last_in_batch);
+
+done_send:
+	if (ret < 0) {
+		if (ret == -EAGAIN)
+			return 0;
+		return ret;
+	}
+
+	return 1;
+}
+
+static int nvmet_tcp_try_send(struct nvmet_tcp_queue *queue,
+		int budget, int *sends)
+{
+	int i, ret = 0;
+
+	for (i = 0; i < budget; i++) {
+		ret = nvmet_tcp_try_send_one(queue, i == budget - 1);
+		if (ret <= 0)
+			break;
+		(*sends)++;
+	}
+
+	return ret;
+}
+
+static void nvmet_prepare_receive_pdu(struct nvmet_tcp_queue *queue)
+{
+	queue->offset = 0;
+	queue->left = sizeof(struct nvme_tcp_hdr);
+	queue->cmd = NULL;
+	queue->rcv_state = NVMET_TCP_RECV_PDU;
+}
+
+static void nvmet_tcp_free_crypto(struct nvmet_tcp_queue *queue)
+{
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash);
+
+	ahash_request_free(queue->rcv_hash);
+	ahash_request_free(queue->snd_hash);
+	crypto_free_ahash(tfm);
+}
+
+static int nvmet_tcp_alloc_crypto(struct nvmet_tcp_queue *queue)
+{
+	struct crypto_ahash *tfm;
+
+	tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC);
+	if (IS_ERR(tfm))
+		return PTR_ERR(tfm);
+
+	queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL);
+	if (!queue->snd_hash)
+		goto free_tfm;
+	ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL);
+
+	queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL);
+	if (!queue->rcv_hash)
+		goto free_snd_hash;
+	ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL);
+
+	return 0;
+free_snd_hash:
+	ahash_request_free(queue->snd_hash);
+free_tfm:
+	crypto_free_ahash(tfm);
+	return -ENOMEM;
+}
+
+
+static int nvmet_tcp_handle_icreq(struct nvmet_tcp_queue *queue)
+{
+	struct nvme_tcp_icreq_pdu *icreq = &queue->pdu.icreq;
+	struct nvme_tcp_icresp_pdu *icresp = &queue->pdu.icresp;
+	struct msghdr msg = {};
+	struct kvec iov;
+	int ret;
+
+	if (le32_to_cpu(icreq->hdr.plen) != sizeof(struct nvme_tcp_icreq_pdu)) {
+		pr_err("bad nvme-tcp pdu length (%d)\n",
+			le32_to_cpu(icreq->hdr.plen));
+		nvmet_tcp_fatal_error(queue);
+	}
+
+	if (icreq->pfv != NVME_TCP_PFV_1_0) {
+		pr_err("queue %d: bad pfv %d\n", queue->idx, icreq->pfv);
+		return -EPROTO;
+	}
+
+	if (icreq->hpda != 0) {
+		pr_err("queue %d: unsupported hpda %d\n", queue->idx,
+			icreq->hpda);
+		return -EPROTO;
+	}
+
+	if (icreq->maxr2t != 0) {
+		pr_err("queue %d: unsupported maxr2t %d\n", queue->idx,
+			le16_to_cpu(icreq->maxr2t) + 1);
+		return -EPROTO;
+	}
+
+	queue->hdr_digest = !!(icreq->digest & NVME_TCP_HDR_DIGEST_ENABLE);
+	queue->data_digest = !!(icreq->digest & NVME_TCP_DATA_DIGEST_ENABLE);
+	if (queue->hdr_digest || queue->data_digest) {
+		ret = nvmet_tcp_alloc_crypto(queue);
+		if (ret)
+			return ret;
+	}
+
+	memset(icresp, 0, sizeof(*icresp));
+	icresp->hdr.type = nvme_tcp_icresp;
+	icresp->hdr.hlen = sizeof(*icresp);
+	icresp->hdr.pdo = 0;
+	icresp->hdr.plen = cpu_to_le32(icresp->hdr.hlen);
+	icresp->pfv = cpu_to_le16(NVME_TCP_PFV_1_0);
+	icresp->maxdata = 0xffff; /* FIXME: support r2t */
+	icresp->cpda = 0;
+	if (queue->hdr_digest)
+		icresp->digest |= NVME_TCP_HDR_DIGEST_ENABLE;
+	if (queue->data_digest)
+		icresp->digest |= NVME_TCP_DATA_DIGEST_ENABLE;
+
+	iov.iov_base = icresp;
+	iov.iov_len = sizeof(*icresp);
+	ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
+	if (ret < 0)
+		goto free_crypto;
+
+	queue->state = NVMET_TCP_Q_LIVE;
+	nvmet_prepare_receive_pdu(queue);
+	return 0;
+free_crypto:
+	if (queue->hdr_digest || queue->data_digest)
+		nvmet_tcp_free_crypto(queue);
+	return ret;
+}
+
+static void nvmet_tcp_handle_req_failure(struct nvmet_tcp_queue *queue,
+		struct nvmet_tcp_cmd *cmd, struct nvmet_req *req)
+{
+	int ret;
+
+	/* recover the expected data transfer length */
+	req->data_len = le32_to_cpu(req->cmd->common.dptr.sgl.length);
+
+	if (!nvme_is_write(cmd->req.cmd) ||
+	    req->data_len > cmd->req.port->inline_data_size) {
+		nvmet_prepare_receive_pdu(queue);
+		return;
+	}
+
+	ret = nvmet_tcp_map_data(cmd);
+	if (unlikely(ret)) {
+		pr_err("queue %d: failed to map data\n", queue->idx);
+		nvmet_tcp_fatal_error(queue);
+		return;
+	}
+
+	queue->rcv_state = NVMET_TCP_RECV_DATA;
+	nvmet_tcp_map_pdu_iovec(cmd);
+	cmd->flags |= NVMET_TCP_F_INIT_FAILED;
+}
+
+static int nvmet_tcp_handle_h2c_data_pdu(struct nvmet_tcp_queue *queue)
+{
+	struct nvme_tcp_data_pdu *data = &queue->pdu.data;
+	struct nvmet_tcp_cmd *cmd;
+
+	cmd = &queue->cmds[data->ttag];
+
+	if (le32_to_cpu(data->data_offset) != cmd->rbytes_done) {
+		pr_err("ttag %u unexpected data offset %u (expected %u)\n",
+			data->ttag, le32_to_cpu(data->data_offset),
+			cmd->rbytes_done);
+		/* FIXME: use path and transport errors */
+		nvmet_req_complete(&cmd->req,
+			NVME_SC_INVALID_FIELD | NVME_SC_DNR);
+		return -EPROTO;
+	}
+
+	cmd->pdu_len = le32_to_cpu(data->data_length);
+	cmd->pdu_recv = 0;
+	nvmet_tcp_map_pdu_iovec(cmd);
+	queue->cmd = cmd;
+	queue->rcv_state = NVMET_TCP_RECV_DATA;
+
+	return 0;
+}
+
+static int nvmet_tcp_done_recv_pdu(struct nvmet_tcp_queue *queue)
+{
+	struct nvme_tcp_hdr *hdr = &queue->pdu.cmd.hdr;
+	struct nvme_command *nvme_cmd = &queue->pdu.cmd.cmd;
+	struct nvmet_req *req;
+	int ret;
+
+	if (unlikely(queue->state == NVMET_TCP_Q_CONNECTING)) {
+		if (hdr->type != nvme_tcp_icreq) {
+			pr_err("unexpected pdu type (%d) before icreq\n",
+				hdr->type);
+			nvmet_tcp_fatal_error(queue);
+			return -EPROTO;
+		}
+		return nvmet_tcp_handle_icreq(queue);
+	}
+
+	if (hdr->type == nvme_tcp_h2c_data) {
+		ret = nvmet_tcp_handle_h2c_data_pdu(queue);
+		if (unlikely(ret))
+			return ret;
+		return 0;
+	}
+
+	queue->cmd = nvmet_tcp_get_cmd(queue);
+	if (unlikely(!queue->cmd)) {
+		/* This should never happen */
+		pr_err("queue %d: out of commands (%d) send_list_len: %d, opcode: %d",
+			queue->idx, queue->nr_cmds, queue->send_list_len,
+			nvme_cmd->common.opcode);
+		nvmet_tcp_fatal_error(queue);
+		return -ENOMEM;
+	}
+
+	req = &queue->cmd->req;
+	memcpy(req->cmd, nvme_cmd, sizeof(*nvme_cmd));
+
+	if (unlikely(!nvmet_req_init(req, &queue->nvme_cq,
+			&queue->nvme_sq, &nvmet_tcp_ops))) {
+		pr_err("failed cmd %p id %d opcode %d, data_len: %d\n",
+			req->cmd, req->cmd->common.command_id,
+			req->cmd->common.opcode,
+			le32_to_cpu(req->cmd->common.dptr.sgl.length));
+
+		nvmet_tcp_handle_req_failure(queue, queue->cmd, req);
+		return -EAGAIN;
+	}
+
+	ret = nvmet_tcp_map_data(queue->cmd);
+	if (unlikely(ret)) {
+		pr_err("queue %d: failed to map data\n", queue->idx);
+		if (nvmet_tcp_has_inline_data(queue->cmd))
+			nvmet_tcp_fatal_error(queue);
+		else
+			nvmet_req_complete(req, ret);
+		ret = -EAGAIN;
+		goto out;
+	}
+
+	if (nvmet_tcp_need_data_in(queue->cmd)) {
+		if (nvmet_tcp_has_inline_data(queue->cmd)) {
+			queue->rcv_state = NVMET_TCP_RECV_DATA;
+			nvmet_tcp_map_pdu_iovec(queue->cmd);
+			return 0;
+		}
+		/* send back R2T */
+		nvmet_tcp_queue_response(&queue->cmd->req);
+		goto out;
+	}
+
+	nvmet_req_execute(&queue->cmd->req);
+out:
+	nvmet_prepare_receive_pdu(queue);
+	return ret;
+}
+
+static const u8 nvme_tcp_pdu_sizes[] = {
+	[nvme_tcp_icreq]	= sizeof(struct nvme_tcp_icreq_pdu),
+	[nvme_tcp_cmd]		= sizeof(struct nvme_tcp_cmd_pdu),
+	[nvme_tcp_h2c_data]	= sizeof(struct nvme_tcp_data_pdu),
+};
+
+static inline u8 nvmet_tcp_pdu_size(u8 type)
+{
+	size_t idx = type;
+
+	return (idx < ARRAY_SIZE(nvme_tcp_pdu_sizes) &&
+		nvme_tcp_pdu_sizes[idx]) ?
+			nvme_tcp_pdu_sizes[idx] : 0;
+}
+
+static inline bool nvmet_tcp_pdu_valid(u8 type)
+{
+	switch (type) {
+	case nvme_tcp_icreq:
+	case nvme_tcp_cmd:
+	case nvme_tcp_h2c_data:
+		/* fallthru */
+		return true;
+	}
+
+	return false;
+}
+
+static int nvmet_tcp_try_recv_pdu(struct nvmet_tcp_queue *queue)
+{
+	struct nvme_tcp_hdr *hdr = &queue->pdu.cmd.hdr;
+	int len;
+	struct kvec iov;
+	struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
+
+recv:
+	iov.iov_base = (void *)&queue->pdu + queue->offset;
+	iov.iov_len = queue->left;
+	len = kernel_recvmsg(queue->sock, &msg, &iov, 1,
+			iov.iov_len, msg.msg_flags);
+	if (unlikely(len < 0))
+		return len;
+
+	queue->offset += len;
+	queue->left -= len;
+	if (queue->left)
+		return -EAGAIN;
+
+	if (queue->offset == sizeof(struct nvme_tcp_hdr)) {
+		u8 hdgst = nvmet_tcp_hdgst_len(queue);
+
+		if (unlikely(!nvmet_tcp_pdu_valid(hdr->type))) {
+			pr_err("unexpected pdu type %d\n", hdr->type);
+			nvmet_tcp_fatal_error(queue);
+			return -EIO;
+		}
+
+		if (unlikely(hdr->hlen != nvmet_tcp_pdu_size(hdr->type))) {
+			pr_err("pdu %d bad hlen %d\n", hdr->type, hdr->hlen);
+			return -EIO;
+		}
+
+		queue->left = hdr->hlen - queue->offset + hdgst;
+		goto recv;
+	}
+
+	if (queue->hdr_digest &&
+	    nvmet_tcp_verify_hdgst(queue, &queue->pdu, queue->offset)) {
+		nvmet_tcp_fatal_error(queue); /* fatal */
+		return -EPROTO;
+	}
+
+	if (queue->data_digest &&
+	    nvmet_tcp_check_ddgst(queue, &queue->pdu)) {
+		nvmet_tcp_fatal_error(queue); /* fatal */
+		return -EPROTO;
+	}
+
+	return nvmet_tcp_done_recv_pdu(queue);
+}
+
+static void nvmet_tcp_prep_recv_ddgst(struct nvmet_tcp_cmd *cmd)
+{
+	struct nvmet_tcp_queue *queue = cmd->queue;
+
+	nvmet_tcp_ddgst(queue->rcv_hash, cmd);
+	queue->offset = 0;
+	queue->left = NVME_TCP_DIGEST_LENGTH;
+	queue->rcv_state = NVMET_TCP_RECV_DDGST;
+}
+
+static int nvmet_tcp_try_recv_data(struct nvmet_tcp_queue *queue)
+{
+	struct nvmet_tcp_cmd  *cmd = queue->cmd;
+	int ret;
+
+	while (msg_data_left(&cmd->recv_msg)) {
+		ret = sock_recvmsg(cmd->queue->sock, &cmd->recv_msg,
+			cmd->recv_msg.msg_flags);
+		if (ret <= 0)
+			return ret;
+
+		cmd->pdu_recv += ret;
+		cmd->rbytes_done += ret;
+	}
+
+	nvmet_tcp_unmap_pdu_iovec(cmd);
+
+	if (!(cmd->flags & NVMET_TCP_F_INIT_FAILED) &&
+	    cmd->rbytes_done == cmd->req.transfer_len) {
+		if (queue->data_digest) {
+			nvmet_tcp_prep_recv_ddgst(cmd);
+			return 0;
+		}
+		nvmet_req_execute(&cmd->req);
+	}
+
+	nvmet_prepare_receive_pdu(queue);
+	return 0;
+}
+
+static int nvmet_tcp_try_recv_ddgst(struct nvmet_tcp_queue *queue)
+{
+	struct nvmet_tcp_cmd *cmd = queue->cmd;
+	int ret;
+	struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
+	struct kvec iov = {
+		.iov_base = (void *)&cmd->recv_ddgst + queue->offset,
+		.iov_len = queue->left
+	};
+
+	ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
+			iov.iov_len, msg.msg_flags);
+	if (unlikely(ret < 0))
+		return ret;
+
+	queue->offset += ret;
+	queue->left -= ret;
+	if (queue->left)
+		return -EAGAIN;
+
+	if (queue->data_digest && cmd->exp_ddgst != cmd->recv_ddgst) {
+		pr_err("queue %d: cmd %d pdu (%d) data digest error: recv %#x expected %#x\n",
+			queue->idx, cmd->req.cmd->common.command_id,
+			queue->pdu.cmd.hdr.type, le32_to_cpu(cmd->recv_ddgst),
+			le32_to_cpu(cmd->exp_ddgst));
+		nvmet_tcp_finish_cmd(cmd);
+		nvmet_tcp_fatal_error(queue);
+		ret = -EPROTO;
+		goto out;
+	}
+
+	if (!(cmd->flags & NVMET_TCP_F_INIT_FAILED) &&
+	    cmd->rbytes_done == cmd->req.transfer_len)
+		nvmet_req_execute(&cmd->req);
+	ret = 0;
+out:
+	nvmet_prepare_receive_pdu(queue);
+	return ret;
+}
+
+static int nvmet_tcp_try_recv_one(struct nvmet_tcp_queue *queue)
+{
+	int result;
+
+	if (unlikely(queue->rcv_state == NVMET_TCP_RECV_ERR))
+		return 0;
+
+	if (queue->rcv_state == NVMET_TCP_RECV_PDU) {
+		result = nvmet_tcp_try_recv_pdu(queue);
+		if (result != 0)
+			goto done_recv;
+	}
+
+	if (queue->rcv_state == NVMET_TCP_RECV_DATA) {
+		result = nvmet_tcp_try_recv_data(queue);
+		if (result != 0)
+			goto done_recv;
+	}
+
+	if (queue->rcv_state == NVMET_TCP_RECV_DDGST) {
+		result = nvmet_tcp_try_recv_ddgst(queue);
+		if (result != 0)
+			goto done_recv;
+	}
+
+done_recv:
+	if (result < 0) {
+		if (result == -EAGAIN)
+			return 0;
+		return result;
+	}
+	return 1;
+}
+
+static int nvmet_tcp_try_recv(struct nvmet_tcp_queue *queue,
+		int budget, int *recvs)
+{
+	int i, ret = 0;
+
+	for (i = 0; i < budget; i++) {
+		ret = nvmet_tcp_try_recv_one(queue);
+		if (ret <= 0)
+			break;
+		(*recvs)++;
+	}
+
+	return ret;
+}
+
+static void nvmet_tcp_schedule_release_queue(struct nvmet_tcp_queue *queue)
+{
+	spin_lock(&queue->state_lock);
+	if (queue->state != NVMET_TCP_Q_DISCONNECTING) {
+		queue->state = NVMET_TCP_Q_DISCONNECTING;
+		schedule_work(&queue->release_work);
+	}
+	spin_unlock(&queue->state_lock);
+}
+
+static void nvmet_tcp_io_work(struct work_struct *w)
+{
+	struct nvmet_tcp_queue *queue =
+		container_of(w, struct nvmet_tcp_queue, io_work);
+	bool pending;
+	int ret, ops = 0;
+
+	do {
+		pending = false;
+
+		ret = nvmet_tcp_try_recv(queue, NVMET_TCP_RECV_BUDGET, &ops);
+		if (ret > 0) {
+			pending = true;
+		} else if (ret < 0) {
+			if (ret == -EPIPE || ret == -ECONNRESET)
+				kernel_sock_shutdown(queue->sock, SHUT_RDWR);
+			else
+				nvmet_tcp_fatal_error(queue);
+			return;
+		}
+
+		ret = nvmet_tcp_try_send(queue, NVMET_TCP_SEND_BUDGET, &ops);
+		if (ret > 0) {
+			/* transmitted message/data */
+			pending = true;
+		} else if (ret < 0) {
+			if (ret == -EPIPE || ret == -ECONNRESET)
+				kernel_sock_shutdown(queue->sock, SHUT_RDWR);
+			else
+				nvmet_tcp_fatal_error(queue);
+			return;
+		}
+
+	} while (pending && ops < NVMET_TCP_IO_WORK_BUDGET);
+
+	/*
+	 * We exahusted our budget, requeue our selves
+	 */
+	if (pending)
+		queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
+}
+
+static int nvmet_tcp_alloc_cmd(struct nvmet_tcp_queue *queue,
+		struct nvmet_tcp_cmd *c)
+{
+	u8 hdgst = nvmet_tcp_hdgst_len(queue);
+
+	c->queue = queue;
+	c->req.port = queue->port->nport;
+
+	c->cmd_pdu = page_frag_alloc(&queue->pf_cache,
+			sizeof(*c->cmd_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
+	if (!c->cmd_pdu)
+		return -ENOMEM;
+	c->req.cmd = &c->cmd_pdu->cmd;
+
+	c->rsp_pdu = page_frag_alloc(&queue->pf_cache,
+			sizeof(*c->rsp_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
+	if (!c->rsp_pdu)
+		goto out_free_cmd;
+	c->req.rsp = &c->rsp_pdu->cqe;
+
+	c->data_pdu = page_frag_alloc(&queue->pf_cache,
+			sizeof(*c->data_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
+	if (!c->data_pdu)
+		goto out_free_rsp;
+
+	c->r2t_pdu = page_frag_alloc(&queue->pf_cache,
+			sizeof(*c->r2t_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
+	if (!c->r2t_pdu)
+		goto out_free_data;
+
+	c->recv_msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
+
+	list_add_tail(&c->entry, &queue->free_list);
+
+	return 0;
+out_free_data:
+	page_frag_free(c->data_pdu);
+out_free_rsp:
+	page_frag_free(c->rsp_pdu);
+out_free_cmd:
+	page_frag_free(c->cmd_pdu);
+	return -ENOMEM;
+}
+
+static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c)
+{
+	page_frag_free(c->r2t_pdu);
+	page_frag_free(c->data_pdu);
+	page_frag_free(c->rsp_pdu);
+	page_frag_free(c->cmd_pdu);
+}
+
+static int nvmet_tcp_alloc_cmds(struct nvmet_tcp_queue *queue)
+{
+	struct nvmet_tcp_cmd *cmds;
+	int i, ret = -EINVAL, nr_cmds = queue->nr_cmds;
+
+	cmds = kcalloc(nr_cmds, sizeof(struct nvmet_tcp_cmd), GFP_KERNEL);
+	if (!cmds)
+		goto out;
+
+	for (i = 0; i < nr_cmds; i++) {
+		ret = nvmet_tcp_alloc_cmd(queue, cmds + i);
+		if (ret)
+			goto out_free;
+	}
+
+	queue->cmds = cmds;
+
+	return 0;
+out_free:
+	while (--i >= 0)
+		nvmet_tcp_free_cmd(cmds + i);
+	kfree(cmds);
+out:
+	return ret;
+}
+
+static void nvmet_tcp_free_cmds(struct nvmet_tcp_queue *queue)
+{
+	struct nvmet_tcp_cmd *cmds = queue->cmds;
+	int i;
+
+	for (i = 0; i < queue->nr_cmds; i++)
+		nvmet_tcp_free_cmd(cmds + i);
+
+	nvmet_tcp_free_cmd(&queue->connect);
+	kfree(cmds);
+}
+
+static void nvmet_tcp_restore_socket_callbacks(struct nvmet_tcp_queue *queue)
+{
+	struct socket *sock = queue->sock;
+
+	write_lock_bh(&sock->sk->sk_callback_lock);
+	sock->sk->sk_data_ready =  queue->data_ready;
+	sock->sk->sk_state_change = queue->state_change;
+	sock->sk->sk_write_space = queue->write_space;
+	sock->sk->sk_user_data = NULL;
+	write_unlock_bh(&sock->sk->sk_callback_lock);
+}
+
+static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd)
+{
+	nvmet_req_uninit(&cmd->req);
+	nvmet_tcp_unmap_pdu_iovec(cmd);
+	sgl_free(cmd->req.sg);
+}
+
+static void nvmet_tcp_uninit_data_in_cmds(struct nvmet_tcp_queue *queue)
+{
+	struct nvmet_tcp_cmd *cmd = queue->cmds;
+	int i;
+
+	for (i = 0; i < queue->nr_cmds; i++, cmd++) {
+		if (nvmet_tcp_need_data_in(cmd))
+			nvmet_tcp_finish_cmd(cmd);
+	}
+
+	if (!queue->nr_cmds && nvmet_tcp_need_data_in(&queue->connect)) {
+		/* failed in connect */
+		nvmet_tcp_finish_cmd(&queue->connect);
+	}
+}
+
+static void nvmet_tcp_release_queue_work(struct work_struct *w)
+{
+	struct nvmet_tcp_queue *queue =
+		container_of(w, struct nvmet_tcp_queue, release_work);
+
+	mutex_lock(&nvmet_tcp_queue_mutex);
+	list_del_init(&queue->queue_list);
+	mutex_unlock(&nvmet_tcp_queue_mutex);
+
+	nvmet_tcp_restore_socket_callbacks(queue);
+	flush_work(&queue->io_work);
+
+	nvmet_tcp_uninit_data_in_cmds(queue);
+	nvmet_sq_destroy(&queue->nvme_sq);
+	cancel_work_sync(&queue->io_work);
+	sock_release(queue->sock);
+	nvmet_tcp_free_cmds(queue);
+	if (queue->hdr_digest || queue->data_digest)
+		nvmet_tcp_free_crypto(queue);
+	ida_simple_remove(&nvmet_tcp_queue_ida, queue->idx);
+
+	kfree(queue);
+}
+
+static void nvmet_tcp_data_ready(struct sock *sk)
+{
+	struct nvmet_tcp_queue *queue;
+
+	read_lock_bh(&sk->sk_callback_lock);
+	queue = sk->sk_user_data;
+	if (likely(queue))
+		queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
+	read_unlock_bh(&sk->sk_callback_lock);
+}
+
+static void nvmet_tcp_write_space(struct sock *sk)
+{
+	struct nvmet_tcp_queue *queue;
+
+	read_lock_bh(&sk->sk_callback_lock);
+	queue = sk->sk_user_data;
+	if (unlikely(!queue))
+		goto out;
+
+	if (unlikely(queue->state == NVMET_TCP_Q_CONNECTING)) {
+		queue->write_space(sk);
+		goto out;
+	}
+
+	if (sk_stream_is_writeable(sk)) {
+		clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+		queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
+	}
+out:
+	read_unlock_bh(&sk->sk_callback_lock);
+}
+
+static void nvmet_tcp_state_change(struct sock *sk)
+{
+	struct nvmet_tcp_queue *queue;
+
+	write_lock_bh(&sk->sk_callback_lock);
+	queue = sk->sk_user_data;
+	if (!queue)
+		goto done;
+
+	switch (sk->sk_state) {
+	case TCP_FIN_WAIT1:
+	case TCP_CLOSE_WAIT:
+	case TCP_CLOSE:
+		/* FALLTHRU */
+		sk->sk_user_data = NULL;
+		nvmet_tcp_schedule_release_queue(queue);
+		break;
+	default:
+		pr_warn("queue %d unhandled state %d\n",
+			queue->idx, sk->sk_state);
+	}
+done:
+	write_unlock_bh(&sk->sk_callback_lock);
+}
+
+static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue)
+{
+	struct socket *sock = queue->sock;
+	struct linger sol = { .l_onoff = 1, .l_linger = 0 };
+	int ret;
+
+	ret = kernel_getsockname(sock,
+		(struct sockaddr *)&queue->sockaddr);
+	if (ret < 0)
+		return ret;
+
+	ret = kernel_getpeername(sock,
+		(struct sockaddr *)&queue->sockaddr_peer);
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * Cleanup whatever is sitting in the TCP transmit queue on socket
+	 * close. This is done to prevent stale data from being sent should
+	 * the network connection be restored before TCP times out.
+	 */
+	ret = kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER,
+			(char *)&sol, sizeof(sol));
+	if (ret)
+		return ret;
+
+	write_lock_bh(&sock->sk->sk_callback_lock);
+	sock->sk->sk_user_data = queue;
+	queue->data_ready = sock->sk->sk_data_ready;
+	sock->sk->sk_data_ready = nvmet_tcp_data_ready;
+	queue->state_change = sock->sk->sk_state_change;
+	sock->sk->sk_state_change = nvmet_tcp_state_change;
+	queue->write_space = sock->sk->sk_write_space;
+	sock->sk->sk_write_space = nvmet_tcp_write_space;
+	write_unlock_bh(&sock->sk->sk_callback_lock);
+
+	return 0;
+}
+
+static int nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port,
+		struct socket *newsock)
+{
+	struct nvmet_tcp_queue *queue;
+	int ret;
+
+	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
+	if (!queue)
+		return -ENOMEM;
+
+	INIT_WORK(&queue->release_work, nvmet_tcp_release_queue_work);
+	INIT_WORK(&queue->io_work, nvmet_tcp_io_work);
+	queue->sock = newsock;
+	queue->port = port;
+	queue->nr_cmds = 0;
+	spin_lock_init(&queue->state_lock);
+	queue->state = NVMET_TCP_Q_CONNECTING;
+	INIT_LIST_HEAD(&queue->free_list);
+	init_llist_head(&queue->resp_list);
+	INIT_LIST_HEAD(&queue->resp_send_list);
+
+	queue->idx = ida_simple_get(&nvmet_tcp_queue_ida, 0, 0, GFP_KERNEL);
+	if (queue->idx < 0) {
+		ret = queue->idx;
+		goto out_free_queue;
+	}
+
+	ret = nvmet_tcp_alloc_cmd(queue, &queue->connect);
+	if (ret)
+		goto out_ida_remove;
+
+	ret = nvmet_sq_init(&queue->nvme_sq);
+	if (ret)
+		goto out_free_connect;
+
+	port->last_cpu = cpumask_next_wrap(port->last_cpu,
+				cpu_online_mask, -1, false);
+	queue->cpu = port->last_cpu;
+	nvmet_prepare_receive_pdu(queue);
+
+	mutex_lock(&nvmet_tcp_queue_mutex);
+	list_add_tail(&queue->queue_list, &nvmet_tcp_queue_list);
+	mutex_unlock(&nvmet_tcp_queue_mutex);
+
+	ret = nvmet_tcp_set_queue_sock(queue);
+	if (ret)
+		goto out_destroy_sq;
+
+	queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
+
+	return 0;
+out_destroy_sq:
+	mutex_lock(&nvmet_tcp_queue_mutex);
+	list_del_init(&queue->queue_list);
+	mutex_unlock(&nvmet_tcp_queue_mutex);
+	nvmet_sq_destroy(&queue->nvme_sq);
+out_free_connect:
+	nvmet_tcp_free_cmd(&queue->connect);
+out_ida_remove:
+	ida_simple_remove(&nvmet_tcp_queue_ida, queue->idx);
+out_free_queue:
+	kfree(queue);
+	return ret;
+}
+
+static void nvmet_tcp_accept_work(struct work_struct *w)
+{
+	struct nvmet_tcp_port *port =
+		container_of(w, struct nvmet_tcp_port, accept_work);
+	struct socket *newsock;
+	int ret;
+
+	while (true) {
+		ret = kernel_accept(port->sock, &newsock, O_NONBLOCK);
+		if (ret < 0) {
+			if (ret != -EAGAIN)
+				pr_warn("failed to accept err=%d\n", ret);
+			return;
+		}
+		ret = nvmet_tcp_alloc_queue(port, newsock);
+		if (ret) {
+			pr_err("failed to allocate queue\n");
+			sock_release(newsock);
+		}
+	}
+}
+
+static void nvmet_tcp_listen_data_ready(struct sock *sk)
+{
+	struct nvmet_tcp_port *port;
+
+	read_lock_bh(&sk->sk_callback_lock);
+	port = sk->sk_user_data;
+	if (!port)
+		goto out;
+
+	if (sk->sk_state == TCP_LISTEN)
+		schedule_work(&port->accept_work);
+out:
+	read_unlock_bh(&sk->sk_callback_lock);
+}
+
+static int nvmet_tcp_add_port(struct nvmet_port *nport)
+{
+	struct nvmet_tcp_port *port;
+	__kernel_sa_family_t af;
+	int opt, ret;
+
+	port = kzalloc(sizeof(*port), GFP_KERNEL);
+	if (!port)
+		return -ENOMEM;
+
+	switch (nport->disc_addr.adrfam) {
+	case NVMF_ADDR_FAMILY_IP4:
+		af = AF_INET;
+		break;
+	case NVMF_ADDR_FAMILY_IP6:
+		af = AF_INET6;
+		break;
+	default:
+		pr_err("address family %d not supported\n",
+				nport->disc_addr.adrfam);
+		ret = -EINVAL;
+		goto err_port;
+	}
+
+	ret = inet_pton_with_scope(&init_net, af, nport->disc_addr.traddr,
+			nport->disc_addr.trsvcid, &port->addr);
/*
 *  linux/mm/page_alloc.c
 *
 *  Manages the free list, the system allocates free pages here.
 *  Note that kmalloc() lives in slab.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
 *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
 *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
 *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
 *  Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
 *          (lots of bits borrowed from Ingo Molnar & Andrew Morton)
 */
#include <linux/config.h>
#include <linux/stddef.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/compiler.h>
#include <linux/module.h>
#include <linux/suspend.h>
#include <linux/pagevec.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/notifier.h>
#include <linux/topology.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/nodemask.h>
#include <linux/vmalloc.h>
#include <asm/tlbflush.h>
#include "internal.h"
/*
 * MCD - HACK: Find somewhere to initialize this EARLY, or make this
 * initializer cleaner
 */
nodemask_t node_online_map = { { [0] = 1UL } };
EXPORT_SYMBOL(node_online_map);
nodemask_t node_possible_map = NODE_MASK_ALL;
EXPORT_SYMBOL(node_possible_map);
struct pglist_data *pgdat_list;
unsigned long totalram_pages;
unsigned long totalhigh_pages;
long nr_swap_pages;
/*
 * results with 256, 32 in the lowmem_reserve sysctl:
 *	1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
 *	1G machine -> (16M dma, 784M normal, 224M high)
 *	NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
 *	HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
 *	HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA
 */
int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 32 };
EXPORT_SYMBOL(totalram_pages);
EXPORT_SYMBOL(nr_swap_pages);
/*
 * Used by page_zone() to look up the address of the struct zone whose
 * id is encoded in the upper bits of page->flags
 */
struct zone *zone_table[1 << (ZONES_SHIFT + NODES_SHIFT)];
EXPORT_SYMBOL(zone_table);
static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
int min_free_kbytes = 1024;
unsigned long __initdata nr_kernel_pages;
unsigned long __initdata nr_all_pages;
/*
 * Temporary debugging check for pages not lying within a given zone.
 */
static int bad_range(struct zone *zone, struct page *page)
{
	if (page_to_pfn(page) >= zone->zone_start_pfn + zone->spanned_pages)
		return 1;
	if (page_to_pfn(page) < zone->zone_start_pfn)
		return 1;
#ifdef CONFIG_HOLES_IN_ZONE
	if (!pfn_valid(page_to_pfn(page)))
		return 1;
#endif
	if (zone != page_zone(page))
		return 1;
	return 0;
}
static void bad_page(const char *function, struct page *page)
{
	printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n",
		function, current->comm, page);
	printk(KERN_EMERG "flags:0x%0*lx mapping:%p mapcount:%d count:%d\n",
		(int)(2*sizeof(page_flags_t)), (unsigned long)page->flags,
		page->mapping, page_mapcount(page), page_count(page));
	printk(KERN_EMERG "Backtrace:\n");
	dump_stack();
	printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n");
	page->flags &= ~(1 << PG_private	|
			1 << PG_locked	|
			1 << PG_lru	|
			1 << PG_active	|
			1 << PG_dirty	|
			1 << PG_swapcache |
			1 << PG_writeback);
	set_page_count(page, 0);
	reset_page_mapcount(page);
	page->mapping = NULL;
	tainted |= TAINT_BAD_PAGE;
}
#ifndef CONFIG_HUGETLB_PAGE
#define prep_compound_page(page, order) do { } while (0)
#define destroy_compound_page(page, order) do { } while (0)
#else
/*
 * Higher-order pages are called "compound pages".  They are structured thusly:
 *
 * The first PAGE_SIZE page is called the "head page".
 *
 * The remaining PAGE_SIZE pages are called "tail pages".
 *
 * All pages have PG_compound set.  All pages have their ->private pointing at
 * the head page (even the head page has this).
 *
 * The first tail page's ->mapping, if non-zero, holds the address of the
 * compound page's put_page() function.
 *
 * The order of the allocation is stored in the first tail page's ->index
 * This is only for debug at present.  This usage means that zero-order pages
 * may not be compound.
 */
static void prep_compound_page(struct page *page, unsigned long order)
{
	int i;
	int nr_pages = 1 << order;
	page[1].mapping = NULL;
	page[1].index = order;
	for (i = 0; i < nr_pages; i++) {
		struct page *p = page + i;
		SetPageCompound(p);
		p->private = (unsigned long)page;
	}
}
static void destroy_compound_page(struct page *page, unsigned long order)
{
	int i;
	int nr_pages = 1 << order;
	if (!PageCompound(page))
		return;
	if (page[1].index != order)
		bad_page(__FUNCTION__, page);
	for (i = 0; i < nr_pages; i++) {
		struct page *p = page + i;
		if (!PageCompound(p))
			bad_page(__FUNCTION__, page);
		if (p->private != (unsigned long)page)
			bad_page(__FUNCTION__, page);
		ClearPageCompound(p);
	}
}
#endif		/* CONFIG_HUGETLB_PAGE */
/*
 * function for dealing with page's order in buddy system.
 * zone->lock is already acquired when we use these.
 * So, we don't need atomic page->flags operations here.
 */
static inline unsigned long page_order(struct page *page) {
	return page->private;
}
static inline void set_page_order(struct page *page, int order) {
	page->private = order;
	__SetPagePrivate(page);
}
static inline void rmv_page_order(struct page *page)
{
	__ClearPagePrivate(page);
	page->private = 0;
}
/*
 * Locate the struct page for both the matching buddy in our
 * pair (buddy1) and the combined O(n+1) page they form (page).
 *
 * 1) Any buddy B1 will have an order O twin B2 which satisfies
 * the following equation:
 *     B2 = B1 ^ (1 << O)
 * For example, if the starting buddy (buddy2) is #8 its order
 * 1 buddy is #10:
 *     B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
 *
 * 2) Any buddy B will have an order O+1 parent P which
 * satisfies the following equation:
 *     P = B & ~(1 << O)
 *
 * Assumption: *_mem_map is contigious at least up to MAX_ORDER
 */
static inline struct page *
__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
{
	unsigned long buddy_idx = page_idx ^ (1 << order);
	return page + (buddy_idx - page_idx);
}
static inline unsigned long
__find_combined_index(unsigned long page_idx, unsigned int order)
{
	return (page_idx & ~(1 << order));
}
/*
 * This function checks whether a page is free && is the buddy
 * we can do coalesce a page and its buddy if
 * (a) the buddy is free &&
 * (b) the buddy is on the buddy system &&
 * (c) a page and its buddy have the same order.
 * for recording page's order, we use page->private and PG_private.
 *
 */
static inline int page_is_buddy(struct page *page, int order)
{
       if (PagePrivate(page)           &&
           (page_order(page) == order) &&
           !PageReserved(page)         &&
            page_count(page) == 0)
               return 1;
       return 0;
}
/*
 * Freeing function for a buddy system allocator.
 *
 * The concept of a buddy system is to maintain direct-mapped table
 * (containing bit values) for memory blocks of various "orders".
 * The bottom level table contains the map for the smallest allocatable
 * units of memory (here, pages), and each level above it describes
 * pairs of units from the levels below, hence, "buddies".
 * At a high level, all that happens here is marking the table entry
 * at the bottom level available, and propagating the changes upward
 * as necessary, plus some accounting needed to play nicely with other
 * parts of the VM system.
 * At each level, we keep a list of pages, which are heads of continuous
 * free pages of length of (1 << order) and marked with PG_Private.Page's
 * order is recorded in page->private field.
 * So when we are allocating or freeing one, we can derive the state of the
 * other.  That is, if we allocate a small block, and both were   
 * free, the remainder of the region must be split into blocks.   
 * If a block is freed, and its buddy is also free, then this
 * triggers coalescing into a block of larger size.            
 *
 * -- wli
 */
static inline void __free_pages_bulk (struct page *page,
		struct zone *zone, unsigned int order)
{
	unsigned long page_idx;
	int order_size = 1 << order;
	if (unlikely(order))
		destroy_compound_page(page, order);
	page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
	BUG_ON(page_idx & (order_size - 1));
	BUG_ON(bad_range(zone, page));
	zone->free_pages += order_size;
	while (order < MAX_ORDER-1) {
		unsigned long combined_idx;
		struct free_area *area;
		struct page *buddy;
		combined_idx = __find_combined_index(page_idx, order);
		buddy = __page_find_buddy(page, page_idx, order);
		if (bad_range(zone, buddy))
			break;
		if (!page_is_buddy(buddy, order))
			break;		/* Move the buddy up one level. */
		list_del(&buddy->lru);
		area = zone->free_area + order;
		area->nr_free--;
		rmv_page_order(buddy);
		page = page + (combined_idx - page_idx);
		page_idx = combined_idx;
		order++;
	}
	set_page_order(page, order);
	list_add(&page->lru, &zone->free_area[order].free_list);
	zone->free_area[order].nr_free++;
}
static inline void free_pages_check(const char *function, struct page *page)
{
	if (	page_mapcount(page) ||
		page->mapping != NULL ||
		page_count(page) != 0 ||
		(page->flags & (
			1 << PG_lru	|
			1 << PG_private |
			1 << PG_locked	|
			1 << PG_active	|
			1 << PG_reclaim	|
			1 << PG_slab	|
			1 << PG_swapcache |
			1 << PG_writeback )))
		bad_page(function, page);
	if (PageDirty(page))
		ClearPageDirty(page);
}
/*
 * Frees a list of pages. 
 * Assumes all pages on list are in same zone, and of same order.
 * count is the number of pages to free, or 0 for all on the list.
 *
 * If the zone was previously in an "all pages pinned" state then look to
 * see if this freeing clears that state.
 *
 * And clear the zone's pages_scanned counter, to hold off the "all pages are
 * pinned" detection logic.
 */
static int
free_pages_bulk(struct zone *zone, int count,
		struct list_head *list, unsigned int order)
{
	unsigned long flags;
	struct page *page = NULL;
	int ret = 0;
	spin_lock_irqsave(&zone->lock, flags);
	zone->all_unreclaimable = 0;
	zone->pages_scanned = 0;
	while (!list_empty(list) && count--) {
		page = list_entry(list->prev, struct page, lru);
		/* have to delete it as __free_pages_bulk list manipulates */
		list_del(&page->lru);
		__free_pages_bulk(page, zone, order);
		ret++;
	}
	spin_unlock_irqrestore(&zone->lock, flags);
	return ret;
}
void __free_pages_ok(struct page *page, unsigned int order)
{
	LIST_HEAD(list);
	int i;
	arch_free_page(page, order);
	mod_page_state(pgfree, 1 << order);
#ifndef CONFIG_MMU
	if (order > 0)
		for (i = 1 ; i < (1 << order) ; ++i)
			__put_page(page + i);
#endif
	for (i = 0 ; i < (1 << order) ; ++i)
		free_pages_check(__FUNCTION__, page + i);
	list_add(&page->lru, &list);
	kernel_map_pages(page, 1<<order, 0);
	free_pages_bulk(page_zone(page), 1, &list, order);
}
/*
 * The order of subdivision here is critical for the IO subsystem.
 * Please do not alter this order without good reasons and regression
 * testing. Specifically, as large blocks of memory are subdivided,
 * the order in which smaller blocks are delivered depends on the order
 * they're subdivided in this function. This is the primary factor
 * influencing the order in which pages are delivered to the IO
 * subsystem according to empirical testing, and this is also justified
 * by considering the behavior of a buddy system containing a single
 * large block of memory acted on by a series of small allocations.
 * This behavior is a critical factor in sglist merging's success.
 *
 * -- wli
 */
static inline struct page *
expand(struct zone *zone, struct page *page,
 	int low, int high, struct free_area *area)
{
	unsigned long size = 1 << high;
	while (high > low) {
		area--;
		high--;
		size >>= 1;
		BUG_ON(bad_range(zone, &page[size]));
		list_add(&page[size].lru, &area->free_list);
		area->nr_free++;
		set_page_order(&page[size], high);
	}
	return page;
}
void set_page_refs(struct page *page, int order)
{
#ifdef CONFIG_MMU
	set_page_count(page, 1);
#else
	int i;
	/*
	 * We need to reference all the pages for this order, otherwise if
	 * anyone accesses one of the pages with (get/put) it will be freed.
	 * - eg: access_process_vm()
	 */
	for (i = 0; i < (1 << order); i++)
		set_page_count(page + i, 1);
#endif /* CONFIG_MMU */
}
/*
 * This page is about to be returned from the page allocator
 */
static void prep_new_page(struct page *page, int order)
{
	if (page->mapping || page_mapcount(page) ||
	    (page->flags & (
			1 << PG_private	|
			1 << PG_locked	|
			1 << PG_lru	|
			1 << PG_active	|
			1 << PG_dirty	|
			1 << PG_reclaim	|
			1 << PG_swapcache |
			1 << PG_writeback )))
		bad_page(__FUNCTION__, page);
	page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
			1 << PG_referenced | 1 << PG_arch_1 |
			1 << PG_checked | 1 << PG_mappedtodisk);
	page->private = 0;
	set_page_refs(page, order);
	kernel_map_pages(page, 1 << order, 1);
}
/* 
 * Do the hard work of removing an element from the buddy allocator.
 * Call me with the zone->lock already held.
 */
static struct page *__rmqueue(struct zone *zone, unsigned int order)
{
	struct free_area * area;
	unsigned int current_order;
	struct page *page;
	for (current_order = order; current_order < MAX_ORDER; ++current_order) {
		area = zone->free_area + current_order;
		if (list_empty(&area->free_list))
			continue;
		page = list_entry(area->free_list.next, struct page, lru);
		list_del(&page->lru);
		rmv_page_order(page);
		area->nr_free--;
		zone->free_pages -= 1UL << order;
		return expand(zone, page, order, current_order, area);
	}
	return NULL;
}
/* 
 * Obtain a specified number of elements from the buddy allocator, all under
 * a single hold of the lock, for efficiency.  Add them to the supplied list.
 * Returns the number of new pages which were placed at *list.
 */
static int rmqueue_bulk(struct zone *zone, unsigned int order, 
			unsigned long count, struct list_head *list)
{
	unsigned long flags;
	int i;
	int allocated = 0;
	struct page *page;
	
	spin_lock_irqsave(&zone->lock, flags);
	for (i = 0; i < count; ++i) {
		page = __rmqueue(zone, order);
		if (page == NULL)
			break;
		allocated++;
		list_add_tail(&page->lru, list);
	}
	spin_unlock_irqrestore(&zone->lock, flags);
	return allocated;
}
#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
static void __drain_pages(unsigned int cpu)
{
	struct zone *zone;
	int i;
	for_each_zone(zone) {
		struct per_cpu_pageset *pset;
		pset = &zone->pageset[cpu];
		for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
			struct per_cpu_pages *pcp;
			pcp = &pset->pcp[i];
			pcp->count -= free_pages_bulk(zone, pcp->count,
						&pcp->list, 0);
		}
	}
}
#endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */
#ifdef CONFIG_PM
void mark_free_pages(struct zone *zone)
{
	unsigned long zone_pfn, flags;
	int order;
	struct list_head *curr;
	if (!zone->spanned_pages)
		return;
	spin_lock_irqsave(&zone->lock, flags);
	for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
		ClearPageNosaveFree(pfn_to_page(zone_pfn + zone->zone_start_pfn));
	for (order = MAX_ORDER - 1; order >= 0; --order)
		list_for_each(curr, &zone->free_area[order].free_list) {
			unsigned long start_pfn, i;
			start_pfn = page_to_pfn(list_entry(curr, struct page, lru));
			for (i=0; i < (1<<order); i++)
				SetPageNosaveFree(pfn_to_page(start_pfn+i));
	}
	spin_unlock_irqrestore(&zone->lock, flags);
}
/*
 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
 */
void drain_local_pages(void)
{
	unsigned long flags;
	local_irq_save(flags);	
	__drain_pages(smp_processor_id());
	local_irq_restore(flags);	
}
#endif /* CONFIG_PM */
static void zone_statistics(struct zonelist *zonelist, struct zone *z)
{
#ifdef CONFIG_NUMA
	unsigned long flags;
	int cpu;
	pg_data_t *pg = z->zone_pgdat;
	pg_data_t *orig = zonelist->zones[0]->zone_pgdat;
	struct per_cpu_pageset *p;
	local_irq_save(flags);
	cpu = smp_processor_id();
	p = &z->pageset[cpu];
	if (pg == orig) {
		z->pageset[cpu].numa_hit++;
	} else {
		p->numa_miss++;
		zonelist->zones[0]->pageset[cpu].numa_foreign++;
	}
	if (pg == NODE_DATA(numa_node_id()))
		p->local_node++;
	else
		p->other_node++;
	local_irq_restore(flags);
#endif
}
/*
 * Free a 0-order page
 */
static void FASTCALL(free_hot_cold_page(struct page *page, int cold));
static void fastcall free_hot_cold_page(struct page *page, int cold)
{
	struct zone *zone = page_zone(page);
	struct per_cpu_pages *pcp;
	unsigned long flags;
	arch_free_page(page, 0);
	kernel_map_pages(page, 1, 0);
	inc_page_state(pgfree);
	if (PageAnon(page))
		page->mapping = NULL;
	free_pages_check(__FUNCTION__, page);
	pcp = &zone->pageset[get_cpu()].pcp[cold];
	local_irq_save(flags);
	if (pcp->count >= pcp->high)
		pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
	list_add(&page->lru, &pcp->list);
	pcp->count++;
	local_irq_restore(flags);
	put_cpu();
}
void fastcall free_hot_page(struct page *page)
{
	free_hot_cold_page(page, 0);
}
	
void fastcall free_cold_page(struct page *page)
{
	free_hot_cold_page(page, 1);
}
static inline void prep_zero_page(struct page *page, int order, unsigned int __nocast gfp_flags)
{
	int i;
	BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM);
	for(i = 0; i < (1 << order); i++)
		clear_highpage(page + i);
}
/*
 * Really, prep_compound_page() should be called from __rmqueue_bulk().  But
 * we cheat by calling it from here, in the order > 0 path.  Saves a branch
 * or two.
 */
static struct page *
buffered_rmqueue(struct zone *zone, int order, unsigned int __nocast gfp_flags)
{
	unsigned long flags;
	struct page *page = NULL;
	int cold = !!(gfp_flags & __GFP_COLD);
	if (order == 0) {
		struct per_cpu_pages *pcp;
		pcp = &zone->pageset[get_cpu()].pcp[cold];
		local_irq_save(flags);
		if (pcp->count <= pcp->low)
			pcp->count += rmqueue_bulk(zone, 0,
						pcp->batch, &pcp->list);
		if (pcp->count) {
			page = list_entry(pcp->list.next, struct page, lru);
			list_del(&page->lru);
			pcp->count--;
		}
		local_irq_restore(flags);
		put_cpu();
	}
	if (page == NULL) {
		spin_lock_irqsave(&zone->lock, flags);
		page = __rmqueue(zone, order);
		spin_unlock_irqrestore(&zone->lock, flags);
	}
	if (page != NULL) {
		BUG_ON(bad_range(zone, page));
		mod_page_state_zone(zone, pgalloc, 1 << order);
		prep_new_page(page, order);
		if (gfp_flags & __GFP_ZERO)
			prep_zero_page(page, order, gfp_flags);
		if (order && (gfp_flags & __GFP_COMP))
			prep_compound_page(page, order);
	}
	return page;
}
/*
 * Return 1 if free pages are above 'mark'. This takes into account the order
 * of the allocation.
 */
int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
		      int classzone_idx, int can_try_harder, int gfp_high)
{
	/* free_pages my go negative - that's OK */
	long min = mark, free_pages = z->free_pages - (1 << order) + 1;
	int o;
	if (gfp_high)
		min -= min / 2;
	if (can_try_harder)
		min -= min / 4;
	if (free_pages <= min + z->lowmem_reserve[classzone_idx])
		return 0;
	for (o = 0; o < order; o++) {
		/* At the next order, this order's pages become unavailable */
		free_pages -= z->free_area[o].nr_free << o;
		/* Require fewer higher order pages to be free */
		min >>= 1;
		if (free_pages <= min)
			return 0;
	}
	return 1;
}
/*
 * This is the 'heart' of the zoned buddy allocator.
 */
struct page * fastcall
__alloc_pages(unsigned int __nocast gfp_mask, unsigned int order,
		struct zonelist *zonelist)
{
	const int wait = gfp_mask & __GFP_WAIT;
	struct zone **zones, *z;
	struct page *page;
	struct reclaim_state reclaim_state;
	struct task_struct *p = current;
	int i;
	int classzone_idx;
	int do_retry;
	int can_try_harder;
	int did_some_progress;
	might_sleep_if(wait);
	/*
	 * The caller may dip into page reserves a bit more if the caller
	 * cannot run direct reclaim, or is the caller has realtime scheduling
	 * policy
	 */
	can_try_harder = (unlikely(rt_task(p)) && !in_interrupt()) || !wait;
	zones = zonelist->zones;  /* the list of zones suitable for gfp_mask */
	if (unlikely(zones[0] == NULL)) {
		/* Should this ever happen?? */
		return NULL;
	}
	classzone_idx = zone_idx(zones[0]);
 restart:
	/* Go through the zonelist once, looking for a zone with enough free */
	for (i = 0; (z = zones[i]) != NULL; i++) {
		if (!zone_watermark_ok(z, order, z->pages_low,
				       classzone_idx, 0, 0))
			continue;
		if (!cpuset_zone_allowed(z))
			continue;
		page = buffered_rmqueue(z, order, gfp_mask);
		if (page)
			goto got_pg;
	}
	for (i = 0; (z = zones[i]) != NULL; i++)
		wakeup_kswapd(z, order);
	/*
	 * Go through the zonelist again. Let __GFP_HIGH and allocations
	 * coming from realtime tasks to go deeper into reserves
	 *
	 * This is the last chance, in general, before the goto nopage.
	 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
	 */
	for (i = 0; (z = zones[i]) != NULL; i++) {
		if (!zone_watermark_ok(z, order, z->pages_min,
				       classzone_idx, can_try_harder,
				       gfp_mask & __GFP_HIGH))
			continue;
		if (wait && !cpuset_zone_allowed(z))
			continue;
		page = buffered_rmqueue(z, order, gfp_mask);
		if (page)
			goto got_pg;
	}
	/* This allocation should allow future memory freeing. */
	if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
			&& !in_interrupt()) {
		if (!(gfp_mask & __GFP_NOMEMALLOC)) {
			/* go through the zonelist yet again, ignoring mins */
			for (i = 0; (z = zones[i]) != NULL; i++) {
				if (!cpuset_zone_allowed(z))
					continue;
				page = buffered_rmqueue(z, order, gfp_mask);
				if (page)
					goto got_pg;
			}
		}
		goto nopage;
	}
	/* Atomic allocations - we can't balance anything */
	if (!wait)
		goto nopage;
rebalance:
	cond_resched();
	/* We now go into synchronous reclaim */
	p->flags |= PF_MEMALLOC;
	reclaim_state.reclaimed_slab = 0;
	p->reclaim_state = &reclaim_state;
	did_some_progress = try_to_free_pages(zones, gfp_mask, order);
	p->reclaim_state = NULL;
	p->flags &= ~PF_MEMALLOC;
	cond_resched();
	if (likely(did_some_progress)) {
		/*
		 * Go through the zonelist yet one more time, keep
		 * very high watermark here, this is only to catch
		 * a parallel oom killing, we must fail if we're still
		 * under heavy pressure.
		 */
		for (i = 0; (z = zones[i]) != NULL; i++) {
			if (!zone_watermark_ok(z, order, z->pages_min,
					       classzone_idx, can_try_harder,
					       gfp_mask & __GFP_HIGH))
				continue;
			if (!cpuset_zone_allowed(z))
				continue;
			page = buffered_rmqueue(z, order, gfp_mask);
			if (page)
				goto got_pg;
		}
	} else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
		/*
		 * Go through the zonelist yet one more time, keep
		 * very high watermark here, this is only to catch
		 * a parallel oom killing, we must fail if we're still
		 * under heavy pressure.
		 */
		for (i = 0; (z = zones[i]) != NULL; i++) {
			if (!zone_watermark_ok(z, order, z->pages_high,
					       classzone_idx, 0, 0))
				continue;
			if (!cpuset_zone_allowed(z))
				continue;
			page = buffered_rmqueue(z, order, gfp_mask);
			if (page)
				goto got_pg;
		}
		out_of_memory(gfp_mask);
		goto restart;
	}
	/*
	 * Don't let big-order allocations loop unless the caller explicitly
	 * requests that.  Wait for some write requests to complete then retry.
	 *
	 * In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order
	 * <= 3, but that may not be true in other implementations.
	 */
	do_retry = 0;
	if (!(gfp_mask & __GFP_NORETRY)) {
		if ((order <= 3) || (gfp_mask & __GFP_REPEAT))
			do_retry = 1;
		if (gfp_mask & __GFP_NOFAIL)
			do_retry = 1;
	}
	if (do_retry) {
		blk_congestion_wait(WRITE, HZ/50);
		goto rebalance;
	}
nopage:
	if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
		printk(KERN_WARNING "%s: page allocation failure."
			" order:%d, mode:0x%x\n",
			p->comm, order, gfp_mask);
		dump_stack();
	}
	return NULL;
got_pg:
	zone_statistics(zonelist, z);
	return page;
}
EXPORT_SYMBOL(__alloc_pages);
/*
 * Common helper functions.
 */
fastcall unsigned long __get_free_pages(unsigned int __nocast gfp_mask, unsigned int order)
{
	struct page * page;
	page = alloc_pages(gfp_mask, order);
	if (!page)
		return 0;
	return (unsigned long) page_address(page);
}
EXPORT_SYMBOL(__get_free_pages);
fastcall unsigned long get_zeroed_page(unsigned int __nocast gfp_mask)
{
	struct page * page;
	/*
	 * get_zeroed_page() returns a 32-bit address, which cannot represent
	 * a highmem page
	 */
	BUG_ON(gfp_mask & __GFP_HIGHMEM);
	page = alloc_pages(gfp_mask | __GFP_ZERO, 0);
	if (page)
		return (unsigned long) page_address(page);
	return 0;
}
EXPORT_SYMBOL(get_zeroed_page);
void __pagevec_free(struct pagevec *pvec)
{
	int i = pagevec_count(pvec);
	while (--i >= 0)
		free_hot_cold_page(pvec->pages[i], pvec->cold);
}
fastcall void __free_pages(struct page *page, unsigned int order)
{
	if (!PageReserved(page) && put_page_testzero(page)) {
		if (order == 0)
			free_hot_page(page);
		else
			__free_pages_ok(page, order);
	}
}
EXPORT_SYMBOL(__free_pages);
fastcall void free_pages(unsigned long addr, unsigned int order)
{
	if (addr != 0) {
		BUG_ON(!virt_addr_valid((void *)addr));
		__free_pages(virt_to_page((void *)addr), order);
	}
}
EXPORT_SYMBOL(free_pages);
/*
 * Total amount of free (allocatable) RAM:
 */
unsigned int nr_free_pages(void)
{
	unsigned int sum = 0;
	struct zone *zone;
	for_each_zone(zone)
		sum += zone->free_pages;
	return sum;
}
EXPORT_SYMBOL(nr_free_pages);
#ifdef CONFIG_NUMA
unsigned int nr_free_pages_pgdat(pg_data_t *pgdat)
{
	unsigned int i, sum = 0;
	for (i = 0; i < MAX_NR_ZONES; i++)
		sum += pgdat->node_zones[i].free_pages;
	return sum;
}
#endif
static unsigned int nr_free_zone_pages(int offset)
{
	pg_data_t *pgdat;
	unsigned int sum = 0;
	for_each_pgdat(pgdat) {
		struct zonelist *zonelist = pgdat->node_zonelists + offset;
		struct zone **zonep = zonelist->zones;
		struct zone *zone;
		for (zone = *zonep++; zone; zone = *zonep++) {
			unsigned long size = zone->present_pages;
			unsigned long high = zone->pages_high;
			if (size > high)
				sum += size - high;
		}
	}
	return sum;
}
/*
 * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL
 */
unsigned int nr_free_buffer_pages(void)
{
	return nr_free_zone_pages(GFP_USER & GFP_ZONEMASK);
}
/*
 * Amount of free RAM allocatable within all zones
 */
unsigned int nr_free_pagecache_pages(void)
{
	return nr_free_zone_pages(GFP_HIGHUSER & GFP_ZONEMASK);
}
#ifdef CONFIG_HIGHMEM
unsigned int nr_free_highpages (void)
{
	pg_data_t *pgdat;
	unsigned int pages = 0;
	for_each_pgdat(pgdat)
		pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;
	return pages;
}
#endif
#ifdef CONFIG_NUMA
static void show_node(struct zone *zone)
{
	printk("Node %d ", zone->zone_pgdat->node_id);
}
#else
#define show_node(zone)	do { } while (0)
#endif
/*
 * Accumulate the page_state information across all CPUs.
 * The result is unavoidably approximate - it can change
 * during and after execution of this function.
 */
static DEFINE_PER_CPU(struct page_state, page_states) = {0};
atomic_t nr_pagecache = ATOMIC_INIT(0);
EXPORT_SYMBOL(nr_pagecache);
#ifdef CONFIG_SMP
DEFINE_PER_CPU(long, nr_pagecache_local) = 0;
#endif
void __get_page_state(struct page_state *ret, int nr)
{
	int cpu = 0;
	memset(ret, 0, sizeof(*ret));
	cpu = first_cpu(cpu_online_map);
	while (cpu < NR_CPUS) {
		unsigned long *in, *out, off;
		in = (unsigned long *)&per_cpu(page_states, cpu);
		cpu = next_cpu(cpu, cpu_online_map);
		if (cpu < NR_CPUS)
			prefetch(&per_cpu(page_states, cpu));
		out = (unsigned long *)ret;
		for (off = 0; off < nr; off++)
			*out++ += *in++;
	}
}
void get_page_state(struct page_state *ret)
{
	int nr;
	nr = offsetof(struct page_state, GET_PAGE_STATE_LAST);
	nr /= sizeof(unsigned long);
	__get_page_state(ret, nr + 1);
}
void get_full_page_state(struct page_state *ret)
{
	__get_page_state(ret, sizeof(*ret) / sizeof(unsigned long));
}
unsigned long __read_page_state(unsigned offset)
{
	unsigned long ret = 0;
	int cpu;
	for_each_online_cpu(cpu) {
		unsigned long in;
		in = (unsigned long)&per_cpu(page_states, cpu) + offset;
		ret += *((unsigned long *)in);
	}
	return ret;
}
void __mod_page_state(unsigned offset, unsigned long delta)
{
	unsigned long flags;
	void* ptr;
	local_irq_save(flags);
	ptr = &__get_cpu_var(page_states);
	*(unsigned long*)(ptr + offset) += delta;
	local_irq_restore(flags);
}
EXPORT_SYMBOL(__mod_page_state);
void __get_zone_counts(unsigned long *active, unsigned long *inactive,
			unsigned long *free, struct pglist_data *pgdat)
{
	struct zone *zones = pgdat->node_zones;
	int i;
	*active = 0;
	*inactive = 0;
	*free = 0;
	for (i = 0; i < MAX_NR_ZONES; i++) {
		*active += zones[i].nr_active;
		*inactive += zones[i].nr_inactive;
		*free += zones[i].free_pages;
	}
}
void get_zone_counts(unsigned long *active,
		unsigned long *inactive, unsigned long *free)
{
	struct pglist_data *pgdat;
	*active = 0;
	*inactive = 0;
	*free = 0;
	for_each_pgdat(pgdat) {
		unsigned long l, m, n;
		__get_zone_counts(&l, &m, &n, pgdat);
		*active += l;
		*inactive += m;
		*free += n;
	}
}
void si_meminfo(struct sysinfo *val)
{
	val->totalram = totalram_pages;
	val->sharedram = 0;
	val->freeram = nr_free_pages();
	val->bufferram = nr_blockdev_pages();
#ifdef CONFIG_HIGHMEM
	val->totalhigh = totalhigh_pages;
	val->freehigh = nr_free_highpages();
#else
	val->totalhigh = 0;
	val->freehigh = 0;
#endif
	val->mem_unit = PAGE_SIZE;
}
EXPORT_SYMBOL(si_meminfo);
#ifdef CONFIG_NUMA
void si_meminfo_node(struct sysinfo *val, int nid)
{
	pg_data_t *pgdat = NODE_DATA(nid);
	val->totalram = pgdat->node_present_pages;
	val->freeram = nr_free_pages_pgdat(pgdat);
	val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages;
	val->freehigh = pgdat->node_zones[ZONE_HIGHMEM].free_pages;
	val->mem_unit = PAGE_SIZE;
}
#endif
#define K(x) ((x) << (PAGE_SHIFT-10))
/*
 * Show free area list (used inside shift_scroll-lock stuff)
 * We also calculate the percentage fragmentation. We do this by counting the
 * memory on each free list with the exception of the first item on the list.
 */
void show_free_areas(void)
{
	struct page_state ps;
	int cpu, temperature;
	unsigned long active;
	unsigned long inactive;
	unsigned long free;
	struct zone *zone;
	for_each_zone(zone) {
		show_node(zone);
		printk("%s per-cpu:", zone->name);
		if (!zone->present_pages) {
			printk(" empty\n");
			continue;
		} else
			printk("\n");
		for (cpu = 0; cpu < NR_CPUS; ++cpu) {
			struct per_cpu_pageset *pageset;
			if (!cpu_possible(cpu))
				continue;
			pageset = zone->pageset + cpu;
			for (temperature = 0; temperature < 2; temperature++)
				printk("cpu %d %s: low %d, high %d, batch %d\n",
					cpu,
					temperature ? "cold" : "hot",
					pageset->pcp[temperature].low,
					pageset->pcp[temperature].high,
					pageset->pcp[temperature].batch);
		}
	}
	get_page_state(&ps);
	get_zone_counts(&active, &inactive, &free);
	printk("\nFree pages: %11ukB (%ukB HighMem)\n",
		K(nr_free_pages()),
		K(nr_free_highpages()));
	printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu "
		"unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n",
		active,
		inactive,
		ps.nr_dirty,
		ps.nr_writeback,
		ps.nr_unstable,
		nr_free_pages(),
		ps.nr_slab,
		ps.nr_mapped,
		ps.nr_page_table_pages);
	for_each_zone(zone) {
		int i;
		show_node(zone);
		printk("%s"
			" free:%lukB"
			" min:%lukB"
			" low:%lukB"
			" high:%lukB"
			" active:%lukB"
			" inactive:%lukB"
			" present:%lukB"
			" pages_scanned:%lu"
			" all_unreclaimable? %s"
			"\n",
			zone->name,
			K(zone->free_pages),
			K(zone->pages_min),
			K(zone->pages_low),
			K(zone->pages_high),
			K(zone->nr_active),
			K(zone->nr_inactive),
			K(zone->present_pages),
			zone->pages_scanned,
			(zone->all_unreclaimable ? "yes" : "no")
			);
		printk("lowmem_reserve[]:");
		for (i = 0; i < MAX_NR_ZONES; i++)
			printk(" %lu", zone->lowmem_reserve[i]);
		printk("\n");
	}
	for_each_zone(zone) {
 		unsigned long nr, flags, order, total = 0;
		show_node(zone);
		printk("%s: ", zone->name);
		if (!zone->present_pages) {
			printk("empty\n");
			continue;
		}
		spin_lock_irqsave(&zone->lock, flags);
		for (order = 0; order < MAX_ORDER; order++) {
			nr = zone->free_area[order].nr_free;
			total += nr << order;
			printk("%lu*%lukB ", nr, K(1UL) << order);
		}
		spin_unlock_irqrestore(&zone->lock, flags);
		printk("= %lukB\n", K(total));
	}
	show_swap_cache_info();
}
/*
 * Builds allocation fallback zone lists.
 */
static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k)
{
	switch (k) {
		struct zone *zone;
	default:
		BUG();
	case ZONE_HIGHMEM:
		zone = pgdat->node_zones + ZONE_HIGHMEM;
		if (zone->present_pages) {
#ifndef CONFIG_HIGHMEM
			BUG();
#endif
			zonelist->zones[j++] = zone;
		}
	case ZONE_NORMAL:
		zone = pgdat->node_zones + ZONE_NORMAL;
		if (zone->present_pages)
			zonelist->zones[j++] = zone;
	case ZONE_DMA:
		zone = pgdat->node_zones + ZONE_DMA;
		if (zone->present_pages)
			zonelist->zones[j++] = zone;
	}
	return j;
}
#ifdef CONFIG_NUMA
#define MAX_NODE_LOAD (num_online_nodes())
static int __initdata node_load[MAX_NUMNODES];
/**
 * find_next_best_node - find the next node that should appear in a given node's fallback list
 * @node: node whose fallback list we're appending
 * @used_node_mask: nodemask_t of already used nodes
 *
 * We use a number of factors to determine which is the next node that should
 * appear on a given node's fallback list.  The node should not have appeared
 * already in @node's fallback list, and it should be the next closest node
 * according to the distance array (which contains arbitrary distance values
 * from each node to each node in the system), and should also prefer nodes
 * with no CPUs, since presumably they'll have very little allocation pressure
 * on them otherwise.
 * It returns -1 if no node is found.
 */
static int __init find_next_best_node(int node, nodemask_t *used_node_mask)
{
	int i, n, val;
	int min_val = INT_MAX;
	int best_node = -1;
	for_each_online_node(i) {
		cpumask_t tmp;
		/* Start from local node */
		n = (node+i) % num_online_nodes();
		/* Don't want a node to appear more than once */
		if (node_isset(n, *used_node_mask))
			continue;
		/* Use the local node if we haven't already */
		if (!node_isset(node, *used_node_mask)) {
			best_node = node;
			break;
		}
		/* Use the distance array to find the distance */
		val = node_distance(node, n);
		/* Give preference to headless and unused nodes */
		tmp = node_to_cpumask(n);
		if (!cpus_empty(tmp))
			val += PENALTY_FOR_NODE_WITH_CPUS;
		/* Slight preference for less loaded node */
		val *= (MAX_NODE_LOAD*MAX_NUMNODES);
		val += node_load[n];
		if (val < min_val) {
			min_val = val;
			best_node = n;
		}
	}
	if (best_node >= 0)
		node_set(best_node, *used_node_mask);
	return best_node;
}
static void __init build_zonelists(pg_data_t *pgdat)
{
	int i, j, k, node, local_node;
	int prev_node, load;
	struct zonelist *zonelist;
	nodemask_t used_mask;
	/* initialize zonelists */
	for (i = 0; i < GFP_ZONETYPES; i++) {
		zonelist = pgdat->node_zonelists + i;
		zonelist->zones[0] = NULL;
	}
	/* NUMA-aware ordering of nodes */
	local_node = pgdat->node_id;
	load = num_online_nodes();
	prev_node = local_node;
	nodes_clear(used_mask);
	while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
		/*
		 * We don't want to pressure a particular node.
		 * So adding penalty to the first node in same
		 * distance group to make it round-robin.
		 */
		if (node_distance(local_node, node) !=
				node_distance(local_node, prev_node))
			node_load[node] += load;
		prev_node = node;
		load--;
		for (i = 0; i < GFP_ZONETYPES; i++) {
			zonelist = pgdat->node_zonelists + i;
			for (j = 0; zonelist->zones[j] != NULL; j++);
			k = ZONE_NORMAL;
			if (i & __GFP_HIGHMEM)
				k = ZONE_HIGHMEM;
			if (i & __GFP_DMA)
				k = ZONE_DMA;
	 		j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
			zonelist->zones[j] = NULL;
		}
	}
}
#else	/* CONFIG_NUMA */
static void __init build_zonelists(pg_data_t *pgdat)
{
	int i, j, k, node, local_node;
	local_node = pgdat->node_id;
	for (i = 0; i < GFP_ZONETYPES; i++) {
		struct zonelist *zonelist;
		zonelist = pgdat->node_zonelists + i;
		j = 0;
		k = ZONE_NORMAL;
		if (i & __GFP_HIGHMEM)
			k = ZONE_HIGHMEM;
		if (i & __GFP_DMA)
			k = ZONE_DMA;
 		j = build_zonelists_node(pgdat, zonelist, j, k);
 		/*
 		 * Now we build the zonelist so that it contains the zones
 		 * of all the other nodes.
 		 * We don't want to pressure a particular node, so when
 		 * building the zones for node N, we make sure that the
 		 * zones coming right after the local ones are those from
 		 * node N+1 (modulo N)
 		 */
		for (node = local_node + 1; node < MAX_NUMNODES; node++) {
			if (!node_online(node))
				continue;
			j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
		}
		for (node = 0; node < local_node; node++) {
			if (!node_online(node))
				continue;
			j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
		}
		zonelist->zones[j] = NULL;
	}
}
#endif	/* CONFIG_NUMA */
void __init build_all_zonelists(void)
{
	int i;
	for_each_online_node(i)
		build_zonelists(NODE_DATA(i));
	printk("Built %i zonelists\n", num_online_nodes());
	cpuset_init_current_mems_allowed();
}
/*
 * Helper functions to size the waitqueue hash table.
 * Essentially these want to choose hash table sizes sufficiently
 * large so that collisions trying to wait on pages are rare.
 * But in fact, the number of active page waitqueues on typical
 * systems is ridiculously low, less than 200. So this is even
 * conservative, even though it seems large.
 *
 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
 * waitqueues, i.e. the size of the waitq table given the number of pages.
 */
#define PAGES_PER_WAITQUEUE	256
static inline unsigned long wait_table_size(unsigned long pages)
{
	unsigned long size = 1;
	pages /= PAGES_PER_WAITQUEUE;
	while (size < pages)
		size <<= 1;
	/*
	 * Once we have dozens or even hundreds of threads sleeping
	 * on IO we've got bigger problems than wait queue collision.
	 * Limit the size of the wait table to a reasonable size.
	 */
	size = min(size, 4096UL);
	return max(size, 4UL);
}
/*
 * This is an integer logarithm so that shifts can be used later
 * to extract the more random high bits from the multiplicative
 * hash function before the remainder is taken.
 */
static inline unsigned long wait_table_bits(unsigned long size)
{
	return ffz(~size);
}
#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
		unsigned long *zones_size, unsigned long *zholes_size)
{
	unsigned long realtotalpages, totalpages = 0;
	int i;
	for (i = 0; i < MAX_NR_ZONES; i++)
		totalpages += zones_size[i];
	pgdat->node_spanned_pages = totalpages;
	realtotalpages = totalpages;
	if (zholes_size)
		for (i = 0; i < MAX_NR_ZONES; i++)
			realtotalpages -= zholes_size[i];
	pgdat->node_present_pages = realtotalpages;
	printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
}
/*
 * Initially all pages are reserved - free ones are freed
 * up by free_all_bootmem() once the early boot process is
 * done. Non-atomic initialization, single-pass.
 */
void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone,
		unsigned long start_pfn)
{
	struct page *start = pfn_to_page(start_pfn);
	struct page *page;
	for (page = start; page < (start + size); page++) {
		set_page_zone(page, NODEZONE(nid, zone));
		set_page_count(page, 0);
		reset_page_mapcount(page);
		SetPageReserved(page);
		INIT_LIST_HEAD(&page->lru);
#ifdef WANT_PAGE_VIRTUAL
		/* The shift won't overflow because ZONE_NORMAL is below 4G. */
		if (!is_highmem_idx(zone))
			set_page_address(page, __va(start_pfn << PAGE_SHIFT));
#endif
		start_pfn++;
	}
}
void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
				unsigned long size)
{
	int order;
	for (order = 0; order < MAX_ORDER ; order++) {
		INIT_LIST_HEAD(&zone->free_area[order].free_list);
		zone->free_area[order].nr_free = 0;
	}
}
#ifndef __HAVE_ARCH_MEMMAP_INIT
#define memmap_init(size, nid, zone, start_pfn) \
	memmap_init_zone((size), (nid), (zone), (start_pfn))
#endif
/*
 * Set up the zone data structures:
 *   - mark all pages reserved
 *   - mark all memory queues empty
 *   - clear the memory bitmaps
 */
static void __init free_area_init_core(struct pglist_data *pgdat,
		unsigned long *zones_size, unsigned long *zholes_size)
{
	unsigned long i, j;
	const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1);
	int cpu, nid = pgdat->node_id;
	unsigned long zone_start_pfn = pgdat->node_start_pfn;
	pgdat->nr_zones = 0;
	init_waitqueue_head(&pgdat->kswapd_wait);
	pgdat->kswapd_max_order = 0;
	
	for (j = 0; j < MAX_NR_ZONES; j++) {
		struct zone *zone = pgdat->node_zones + j;
		unsigned long size, realsize;
		unsigned long batch;
		zone_table[NODEZONE(nid, j)] = zone;
		realsize = size = zones_size[j];
		if (zholes_size)
			realsize -= zholes_size[j];
		if (j == ZONE_DMA || j == ZONE_NORMAL)
			nr_kernel_pages += realsize;
		nr_all_pages += realsize;
		zone->spanned_pages = size;
		zone->present_pages = realsize;
		zone->name = zone_names[j];
		spin_lock_init(&zone->lock);
		spin_lock_init(&zone->lru_lock);
		zone->zone_pgdat = pgdat;
		zone->free_pages = 0;
		zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
		/*
		 * The per-cpu-pages pools are set to around 1000th of the
		 * size of the zone.  But no more than 1/4 of a meg - there's
		 * no point in going beyond the size of L2 cache.
		 *
		 * OK, so we don't know how big the cache is.  So guess.
		 */
		batch = zone->present_pages / 1024;
		if (batch * PAGE_SIZE > 256 * 1024)
			batch = (256 * 1024) / PAGE_SIZE;
		batch /= 4;		/* We effectively *= 4 below */
		if (batch < 1)
			batch = 1;
		/*
		 * Clamp the batch to a 2^n - 1 value. Having a power
		 * of 2 value was found to be more likely to have
		 * suboptimal cache aliasing properties in some cases.
		 *
		 * For example if 2 tasks are alternately allocating
		 * batches of pages, one task can end up with a lot
		 * of pages of one half of the possible page colors
		 * and the other with pages of the other colors.
		 */
		batch = (1 << fls(batch + batch/2)) - 1;
		for (cpu = 0; cpu < NR_CPUS; cpu++) {
			struct per_cpu_pages *pcp;
			pcp = &zone->pageset[cpu].pcp[0];	/* hot */
			pcp->count = 0;
			pcp->low = 2 * batch;
			pcp->high = 6 * batch;
			pcp->batch = 1 * batch;
			INIT_LIST_HEAD(&pcp->list);
			pcp = &zone->pageset[cpu].pcp[1];	/* cold */
			pcp->count = 0;
			pcp->low = 0;
			pcp->high = 2 * batch;
			pcp->batch = 1 * batch;
			INIT_LIST_HEAD(&pcp->list);
		}
		printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%lu\n",
				zone_names[j], realsize, batch);
		INIT_LIST_HEAD(&zone->active_list);
		INIT_LIST_HEAD(&zone->inactive_list);
		zone->nr_scan_active = 0;
		zone->nr_scan_inactive = 0;
		zone->nr_active = 0;
		zone->nr_inactive = 0;
		if (!size)
			continue;
		/*
		 * The per-page waitqueue mechanism uses hashed waitqueues
		 * per zone.
		 */
		zone->wait_table_size = wait_table_size(size);
		zone->wait_table_bits =
			wait_table_bits(zone->wait_table_size);
		zone->wait_table = (wait_queue_head_t *)
			alloc_bootmem_node(pgdat, zone->wait_table_size
						* sizeof(wait_queue_head_t));
		for(i = 0; i < zone->wait_table_size; ++i)
			init_waitqueue_head(zone->wait_table + i);
		pgdat->nr_zones = j+1;
		zone->zone_mem_map = pfn_to_page(zone_start_pfn);
		zone->zone_start_pfn = zone_start_pfn;
		if ((zone_start_pfn) & (zone_required_alignment-1))
			printk(KERN_CRIT "BUG: wrong zone alignment, it will crash\n");
		memmap_init(size, nid, j, zone_start_pfn);
		zone_start_pfn += size;
		zone_init_free_lists(pgdat, zone, zone->spanned_pages);
	}
}
static void __init alloc_node_mem_map(struct pglist_data *pgdat)
{
	unsigned long size;
	/* Skip empty nodes */
	if (!pgdat->node_spanned_pages)
		return;
	/* ia64 gets its own node_mem_map, before this, without bootmem */
	if (!pgdat->node_mem_map) {
		size = (pgdat->node_spanned_pages + 1) * sizeof(struct page);
		pgdat->node_mem_map = alloc_bootmem_node(pgdat, size);
	}
#ifndef CONFIG_DISCONTIGMEM
	/*
	 * With no DISCONTIG, the global mem_map is just set as node 0's
	 */
	if (pgdat == NODE_DATA(0))
		mem_map = NODE_DATA(0)->node_mem_map;
#endif
}
void __init free_area_init_node(int nid, struct pglist_data *pgdat,
		unsigned long *zones_size, unsigned long node_start_pfn,
		unsigned long *zholes_size)
{
	pgdat->node_id = nid;
	pgdat->node_start_pfn = node_start_pfn;
	calculate_zone_totalpages(pgdat, zones_size, zholes_size);
	alloc_node_mem_map(pgdat);
	free_area_init_core(pgdat, zones_size, zholes_size);
}
#ifndef CONFIG_DISCONTIGMEM
static bootmem_data_t contig_bootmem_data;
struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
EXPORT_SYMBOL(contig_page_data);
void __init free_area_init(unsigned long *zones_size)
{
	free_area_init_node(0, &contig_page_data, zones_size,
			__pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
}
#endif
#ifdef CONFIG_PROC_FS
#include <linux/seq_file.h>
static void *frag_start(struct seq_file *m, loff_t *pos)
{
	pg_data_t *pgdat;
	loff_t node = *pos;
	for (pgdat = pgdat_list; pgdat && node; pgdat = pgdat->pgdat_next)
		--node;
	return pgdat;
}
static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
{
	pg_data_t *pgdat = (pg_data_t *)arg;
	(*pos)++;
	return pgdat->pgdat_next;
}
static void frag_stop(struct seq_file *m, void *arg)
{
}
/* 
 * This walks the free areas for each zone.
 */
static int frag_show(struct seq_file *m, void *arg)
{
	pg_data_t *pgdat = (pg_data_t *)arg;
	struct zone *zone;
	struct zone *node_zones = pgdat->node_zones;
	unsigned long flags;
	int order;
	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
		if (!zone->present_pages)
			continue;
		spin_lock_irqsave(&zone->lock, flags);
		seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
		for (order = 0; order < MAX_ORDER; ++order)
			seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
		spin_unlock_irqrestore(&zone->lock, flags);
		seq_putc(m, '\n');
	}
	return 0;
}
struct seq_operations fragmentation_op = {
	.start	= frag_start,
	.next	= frag_next,
	.stop	= frag_stop,
	.show	= frag_show,
};
static char *vmstat_text[] = {
	"nr_dirty",
	"nr_writeback",
	"nr_unstable",
	"nr_page_table_pages",
	"nr_mapped",
	"nr_slab",
	"pgpgin",
	"pgpgout",
	"pswpin",
	"pswpout",
	"pgalloc_high",
	"pgalloc_normal",
	"pgalloc_dma",
	"pgfree",
	"pgactivate",
	"pgdeactivate",
	"pgfault",
	"pgmajfault",
	"pgrefill_high",
	"pgrefill_normal",
	"pgrefill_dma",
	"pgsteal_high",
	"pgsteal_normal",
	"pgsteal_dma",
	"pgscan_kswapd_high",
	"pgscan_kswapd_normal",
	"pgscan_kswapd_dma",
	"pgscan_direct_high",
	"pgscan_direct_normal",
	"pgscan_direct_dma",
	"pginodesteal",
	"slabs_scanned",
	"kswapd_steal",
	"kswapd_inodesteal",
	"pageoutrun",
	"allocstall",
	"pgrotated",
	"nr_bounce",
};
static void *vmstat_start(struct seq_file *m, loff_t *pos)
{
	struct page_state *ps;
	if (*pos >= ARRAY_SIZE(vmstat_text))
		return NULL;
	ps = kmalloc(sizeof(*ps), GFP_KERNEL);
	m->private = ps;
	if (!ps)
		return ERR_PTR(-ENOMEM);
	get_full_page_state(ps);
	ps->pgpgin /= 2;		/* sectors -> kbytes */
	ps->pgpgout /= 2;
	return (unsigned long *)ps + *pos;
}
static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
{
	(*pos)++;
	if (*pos >= ARRAY_SIZE(vmstat_text))
		return NULL;
	return (unsigned long *)m->private + *pos;
}
static int vmstat_show(struct seq_file *m, void *arg)
{
	unsigned long *l = arg;
	unsigned long off = l - (unsigned long *)m->private;
	seq_printf(m, "%s %lu\n", vmstat_text[off], *l);
	return 0;
}
static void vmstat_stop(struct seq_file *m, void *arg)
{
	kfree(m->private);
	m->private = NULL;
}
struct seq_operations vmstat_op = {
	.start	= vmstat_start,
	.next	= vmstat_next,
	.stop	= vmstat_stop,
	.show	= vmstat_show,
};
#endif /* CONFIG_PROC_FS */
#ifdef CONFIG_HOTPLUG_CPU
static int page_alloc_cpu_notify(struct notifier_block *self,
				 unsigned long action, void *hcpu)
{
	int cpu = (unsigned long)hcpu;
	long *count;
	unsigned long *src, *dest;
	if (action == CPU_DEAD) {
		int i;
		/* Drain local pagecache count. */
		count = &per_cpu(nr_pagecache_local, cpu);
		atomic_add(*count, &nr_pagecache);
		*count = 0;
		local_irq_disable();
		__drain_pages(cpu);
		/* Add dead cpu's page_states to our own. */
		dest = (unsigned long *)&__get_cpu_var(page_states);
		src = (unsigned long *)&per_cpu(page_states, cpu);
		for (i = 0; i < sizeof(struct page_state)/sizeof(unsigned long);
				i++) {
			dest[i] += src[i];
			src[i] = 0;
		}
		local_irq_enable();
	}
	return NOTIFY_OK;
}
#endif /* CONFIG_HOTPLUG_CPU */
void __init page_alloc_init(void)
{
	hotcpu_notifier(page_alloc_cpu_notify, 0);
}
/*
 * setup_per_zone_lowmem_reserve - called whenever
 *	sysctl_lower_zone_reserve_ratio changes.  Ensures that each zone
 *	has a correct pages reserved value, so an adequate number of
 *	pages are left in the zone after a successful __alloc_pages().
 */
static void setup_per_zone_lowmem_reserve(void)
{
	struct pglist_data *pgdat;
	int j, idx;
	for_each_pgdat(pgdat) {
		for (j = 0; j < MAX_NR_ZONES; j++) {
			struct zone *zone = pgdat->node_zones + j;
			unsigned long present_pages = zone->present_pages;
			zone->lowmem_reserve[j] = 0;
			for (idx = j-1; idx >= 0; idx--) {
				struct zone *lower_zone;
				if (sysctl_lowmem_reserve_ratio[idx] < 1)
					sysctl_lowmem_reserve_ratio[idx] = 1;
				lower_zone = pgdat->node_zones + idx;
				lower_zone->lowmem_reserve[j] = present_pages /
					sysctl_lowmem_reserve_ratio[idx];
				present_pages += lower_zone->present_pages;
			}
		}
	}
}
/*
 * setup_per_zone_pages_min - called when min_free_kbytes changes.  Ensures 
 *	that the pages_{min,low,high} values for each zone are set correctly 
 *	with respect to min_free_kbytes.
 */
static void setup_per_zone_pages_min(void)
{
	unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
	unsigned long lowmem_pages = 0;
	struct zone *zone;
	unsigned long flags;
	/* Calculate total number of !ZONE_HIGHMEM pages */
	for_each_zone(zone) {
		if (!is_highmem(zone))
			lowmem_pages += zone->present_pages;
	}
	for_each_zone(zone) {
		spin_lock_irqsave(&zone->lru_lock, flags);
		if (is_highmem(zone)) {
			/*
			 * Often, highmem doesn't need to reserve any pages.
			 * But the pages_min/low/high values are also used for
			 * batching up page reclaim activity so we need a
			 * decent value here.
			 */
			int min_pages;
			min_pages = zone->present_pages / 1024;
			if (min_pages < SWAP_CLUSTER_MAX)
				min_pages = SWAP_CLUSTER_MAX;
			if (min_pages > 128)
				min_pages = 128;
			zone->pages_min = min_pages;
		} else {
			/* if it's a lowmem zone, reserve a number of pages 
			 * proportionate to the zone's size.
			 */
			zone->pages_min = (pages_min * zone->present_pages) / 
			                   lowmem_pages;
		}
		/*
		 * When interpreting these watermarks, just keep in mind that:
		 * zone->pages_min == (zone->pages_min * 4) / 4;
		 */
		zone->pages_low   = (zone->pages_min * 5) / 4;
		zone->pages_high  = (zone->pages_min * 6) / 4;
		spin_unlock_irqrestore(&zone->lru_lock, flags);
	}
}
/*
 * Initialise min_free_kbytes.
 *
 * For small machines we want it small (128k min).  For large machines
 * we want it large (64MB max).  But it is not linear, because network
 * bandwidth does not increase linearly with machine size.  We use
 *
 * 	min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
 *	min_free_kbytes = sqrt(lowmem_kbytes * 16)
 *
 * which yields
 *
 * 16MB:	512k
 * 32MB:	724k
 * 64MB:	1024k
 * 128MB:	1448k
 * 256MB:	2048k
 * 512MB:	2896k
 * 1024MB:	4096k
 * 2048MB:	5792k
 * 4096MB:	8192k
 * 8192MB:	11584k
 * 16384MB:	16384k
 */
static int __init init_per_zone_pages_min(void)
{
	unsigned long lowmem_kbytes;
	lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
	min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
	if (min_free_kbytes < 128)
		min_free_kbytes = 128;
	if (min_free_kbytes > 65536)
		min_free_kbytes = 65536;
	setup_per_zone_pages_min();
	setup_per_zone_lowmem_reserve();
	return 0;
}
module_init(init_per_zone_pages_min)
/*
 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 
 *	that we can call two helper functions whenever min_free_kbytes
 *	changes.
 */
int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 
	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
{
	proc_dointvec(table, write, file, buffer, length, ppos);
	setup_per_zone_pages_min();
	return 0;
}
/*
 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
 *	proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
 *	whenever sysctl_lowmem_reserve_ratio changes.
 *
 * The reserve ratio obviously has absolutely no relation with the
 * pages_min watermarks. The lowmem reserve ratio can only make sense
 * if in function of the boot time zone sizes.
 */
int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
{
	proc_dointvec_minmax(table, write, file, buffer, length, ppos);
	setup_per_zone_lowmem_reserve();
	return 0;
}
__initdata int hashdist = HASHDIST_DEFAULT;
#ifdef CONFIG_NUMA
static int __init set_hashdist(char *str)
{
	if (!str)
		return 0;
	hashdist = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("hashdist=", set_hashdist);
#endif
/*
 * allocate a large system hash table from bootmem
 * - it is assumed that the hash table must contain an exact power-of-2
 *   quantity of entries
 * - limit is the number of hash buckets, not the total allocation size
 */
void *__init alloc_large_system_hash(const char *tablename,
				     unsigned long bucketsize,
				     unsigned long numentries,
				     int scale,
				     int flags,
				     unsigned int *_hash_shift,
				     unsigned int *_hash_mask,
				     unsigned long limit)
{
	unsigned long long max = limit;
	unsigned long log2qty, size;
	void *table = NULL;
	/* allow the kernel cmdline to have a say */
	if (!numentries) {
		/* round applicable memory size up to nearest megabyte */
		numentries = (flags & HASH_HIGHMEM) ? nr_all_pages : nr_kernel_pages;
		numentries += (1UL << (20 - PAGE_SHIFT)) - 1;
		numentries >>= 20 - PAGE_SHIFT;
		numentries <<= 20 - PAGE_SHIFT;
		/* limit to 1 bucket per 2^scale bytes of low memory */
		if (scale > PAGE_SHIFT)
			numentries >>= (scale - PAGE_SHIFT);
		else
			numentries <<= (PAGE_SHIFT - scale);
	}
	/* rounded up to nearest power of 2 in size */
	numentries = 1UL << (long_log2(numentries) + 1);
	/* limit allocation size to 1/16 total memory by default */
	if (max == 0) {
		max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
		do_div(max, bucketsize);
	}
	if (numentries > max)
		numentries = max;
	log2qty = long_log2(numentries);
	do {
		size = bucketsize << log2qty;
		if (flags & HASH_EARLY)
			table = alloc_bootmem(size);
		else if (hashdist)
			table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
		else {
			unsigned long order;
			for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++)
				;
			table = (void*) __get_free_pages(GFP_ATOMIC, order);
		}
	} while (!table && size > PAGE_SIZE && --log2qty);
	if (!table)
		panic("Failed to allocate %s hash table\n", tablename);
	printk("%s hash table entries: %d (order: %d, %lu bytes)\n",
	       tablename,
	       (1U << log2qty),
	       long_log2(size) - PAGE_SHIFT,
	       size);
	if (_hash_shift)
		*_hash_shift = log2qty;
	if (_hash_mask)
		*_hash_mask = (1 << log2qty) - 1;
	return table;
}