diff -urN /md0/kernels/2.4/v2.4.9-ac14/@ aio-v2.4.9-ac14.diff/@
--- /md0/kernels/2.4/v2.4.9-ac14/@	Wed Dec 31 19:00:00 1969
+++ aio-v2.4.9-ac14.diff/@	Mon Sep 24 19:09:13 2001
@@ -0,0 +1,397 @@
+/*
+   md_k.h : kernel internal structure of the Linux MD driver
+   Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   You should have received a copy of the GNU General Public License
+   (for example /usr/src/linux/COPYING); if not, write to the Free
+   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*/
+
+#ifndef _MD_K_H
+#define _MD_K_H
+
+#include	// for panic()
+
+#define MD_RESERVED	0UL
+#define LINEAR		1UL
+#define STRIPED		2UL
+#define RAID0		STRIPED
+#define RAID1		3UL
+#define RAID5		4UL
+#define TRANSLUCENT	5UL
+#define HSM		6UL
+#define MAX_PERSONALITY	7UL
+
+static inline int pers_to_level (int pers)
+{
+	switch (pers) {
+		case HSM:		return -3;
+		case TRANSLUCENT:	return -2;
+		case LINEAR:		return -1;
+		case RAID0:		return 0;
+		case RAID1:		return 1;
+		case RAID5:		return 5;
+	}
+	panic("pers_to_level()");
+	return 0;
+}
+
+static inline int level_to_pers (int level)
+{
+	switch (level) {
+		case -3: return HSM;
+		case -2: return TRANSLUCENT;
+		case -1: return LINEAR;
+		case 0: return RAID0;
+		case 1: return RAID1;
+		case 4:
+		case 5: return RAID5;
+	}
+	return MD_RESERVED;
+}
+
+typedef struct mddev_s mddev_t;
+typedef struct mdk_rdev_s mdk_rdev_t;
+
+#if (MINORBITS != 8)
+#error MD doesnt handle bigger kdev yet
+#endif
+
+#define MAX_MD_DEVS	(1<<MINORBITS)
+
+static inline int disk_faulty(mdp_disk_t * d)
+{
+	return d->state & (1 << MD_DISK_FAULTY);
+}
+
+static inline int disk_active(mdp_disk_t * d)
+{
+	return d->state & (1 << MD_DISK_ACTIVE);
+}
+
+static inline int disk_sync(mdp_disk_t * d)
+{
+	return d->state & (1 << MD_DISK_SYNC);
+}
+
+static inline int disk_spare(mdp_disk_t * d)
+{
+	return !disk_sync(d) && !disk_active(d) && !disk_faulty(d);
+}
+
+static inline int disk_removed(mdp_disk_t * d)
+{
+	return d->state & (1 << MD_DISK_REMOVED);
+}
+
+static inline void mark_disk_faulty(mdp_disk_t * d)
+{
+	d->state |= (1 << MD_DISK_FAULTY);
+}
+
+static inline void mark_disk_active(mdp_disk_t * d)
+{
+	d->state |= (1 << MD_DISK_ACTIVE);
+}
+
+static inline void mark_disk_sync(mdp_disk_t * d)
+{
+	d->state |= (1 << MD_DISK_SYNC);
+}
+
+static inline void mark_disk_spare(mdp_disk_t * d)
+{
+	d->state = 0;
+}
+
+static inline void mark_disk_removed(mdp_disk_t * d)
+{
+	d->state = (1 << MD_DISK_FAULTY) | (1 << MD_DISK_REMOVED);
+}
+
+static inline void mark_disk_inactive(mdp_disk_t * d)
+{
+	d->state &= ~(1 << MD_DISK_ACTIVE);
+}
+
+static inline void mark_disk_nonsync(mdp_disk_t * d)
+{
+	d->state &= ~(1 << MD_DISK_SYNC);
+}
+
+/*
+ * MD's 'extended' device
+ */
+struct mdk_rdev_s
+{
+	struct md_list_head same_set;	/* RAID devices within the same set */
+	struct md_list_head all;	/* all RAID devices */
+	struct md_list_head pending;	/* undetected RAID devices */
+
+	kdev_t dev;			/* Device number */
+	kdev_t old_dev;			/* "" when it was last imported */
+	unsigned long size;		/* Device size (in blocks) */
+	mddev_t *mddev;			/* RAID array if running */
+	unsigned long last_events;	/* IO event timestamp */
+
+	struct block_device *bdev;	/* block device handle */
+
+	mdp_super_t *sb;
+	unsigned long sb_offset;
+
+	int faulty;			/* if faulty do not issue IO requests */
+	int desc_nr;			/* descriptor index in the superblock */
+};
+
+
+/*
+ * disk operations in a working array:
+ */
+#define DISKOP_SPARE_INACTIVE	0
+#define DISKOP_SPARE_WRITE	1
+#define DISKOP_SPARE_ACTIVE	2
+#define DISKOP_HOT_REMOVE_DISK	3
+#define DISKOP_HOT_ADD_DISK	4
+
+typedef struct mdk_personality_s mdk_personality_t;
+
+struct mddev_s
+{
+	void *private;
+	mdk_personality_t *pers;
+	int __minor;
+	mdp_super_t *sb;
+	int nb_dev;
+	struct md_list_head disks;
+	int sb_dirty;
+	mdu_param_t param;
+	int ro;
+	unsigned long curr_resync;	/* blocks scheduled */
+	unsigned long resync_mark;	/* a recent timestamp */
+	unsigned long resync_mark_cnt;	/* blocks written at resync_mark */
+	char *name;
+	int recovery_running;
+	struct semaphore reconfig_sem;
+	struct semaphore recovery_sem;
+	struct semaphore resync_sem;
+	atomic_t active;
+
+	atomic_t recovery_active;	/* blocks scheduled, but not written */
+	md_wait_queue_head_t recovery_wait;
+
+	struct md_list_head all_mddevs;
+};
+
+struct mdk_personality_s
+{
+	char *name;
+	int (*make_request)(mddev_t *mddev, int rw, struct buffer_head * bh);
+	int (*run)(mddev_t *mddev);
+	int (*stop)(mddev_t *mddev);
+	int (*status)(char *page, mddev_t *mddev);
+	int (*error_handler)(mddev_t *mddev, kdev_t dev);
+
+/*
+ * Some personalities (RAID-1, RAID-5) can have disks hot-added and
+ * hot-removed. Hot removal is different from failure. (failure marks
+ * a disk inactive, but the disk is still part of the array) The interface
+ * to such operations is the 'pers->diskop()' function, can be NULL.
+ *
+ * the diskop function can change the pointer pointing to the incoming
+ * descriptor, but must do so very carefully. (currently only
+ * SPARE_ACTIVE expects such a change)
+ */
+	int (*diskop) (mddev_t *mddev, mdp_disk_t **descriptor, int state);
+
+	int (*stop_resync)(mddev_t *mddev);
+	int (*restart_resync)(mddev_t *mddev);
+	int (*sync_request)(mddev_t *mddev, unsigned long block_nr);
+};
+
+
+/*
+ * Currently we index md_array directly, based on the minor
+ * number. This will have to change to dynamic allocation
+ * once we start supporting partitioning of md devices.
+ */
+static inline int mdidx (mddev_t * mddev)
+{
+	return mddev->__minor;
+}
+
+static inline kdev_t mddev_to_kdev(mddev_t * mddev)
+{
+	return MKDEV(MD_MAJOR, mdidx(mddev));
+}
+
+extern mdk_rdev_t * find_rdev(mddev_t * mddev, kdev_t dev);
+extern mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr);
+
+/*
+ * iterates through some rdev ringlist. It's safe to remove the
+ * current 'rdev'. Dont touch 'tmp' though.
+ */
+#define ITERATE_RDEV_GENERIC(head,field,rdev,tmp)		\
+								\
+	for (tmp = head.next;					\
+		rdev = md_list_entry(tmp, mdk_rdev_t, field),	\
+			tmp = tmp->next, tmp->prev != &head	\
+		; )
+/*
+ * iterates through the 'same array disks' ringlist
+ */
+#define ITERATE_RDEV(mddev,rdev,tmp)				\
+	ITERATE_RDEV_GENERIC((mddev)->disks,same_set,rdev,tmp)
+
+/*
+ * Same as above, but assumes that the device has rdev->desc_nr numbered
+ * from 0 to mddev->nb_dev, and iterates through rdevs in ascending order.
+ */
+#define ITERATE_RDEV_ORDERED(mddev,rdev,i)			\
+	for (i = 0; rdev = find_rdev_nr(mddev, i), i < mddev->nb_dev; i++)
+
+
+/*
+ * Iterates through all 'RAID managed disks'
+ */
+#define ITERATE_RDEV_ALL(rdev,tmp)				\
+	ITERATE_RDEV_GENERIC(all_raid_disks,all,rdev,tmp)
+
+/*
+ * Iterates through 'pending RAID disks'
+ */
+#define ITERATE_RDEV_PENDING(rdev,tmp)				\
+	ITERATE_RDEV_GENERIC(pending_raid_disks,pending,rdev,tmp)
+
+/*
+ * iterates through all used mddevs in the system.
+ */
+#define ITERATE_MDDEV(mddev,tmp)				\
+								\
+	for (tmp = all_mddevs.next;				\
+		mddev = md_list_entry(tmp, mddev_t, all_mddevs),\
+			tmp = tmp->next, tmp->prev != &all_mddevs\
+		; )
+
+static inline int lock_mddev (mddev_t * mddev)
+{
+	return down_interruptible(&mddev->reconfig_sem);
+}
+
+static inline void unlock_mddev (mddev_t * mddev)
+{
+	up(&mddev->reconfig_sem);
+}
+
+#define xchg_values(x,y) do { __typeof__(x) __tmp = x; \
+				x = y; y = __tmp; } while (0)
+
+typedef struct mdk_thread_s {
+	void (*run) (void *data);
+	void *data;
+	md_wait_queue_head_t wqueue;
+	unsigned long flags;
+	struct completion *event;
+	struct task_struct *tsk;
+	const char *name;
+} mdk_thread_t;
+
+#define THREAD_WAKEUP 0
+
+#define MAX_DISKNAME_LEN 64
+
+typedef struct dev_name_s {
+	struct md_list_head list;
+	kdev_t dev;
+	char namebuf [MAX_DISKNAME_LEN];
+	char *name;
+} dev_name_t;
+
+
+#define __wait_event_lock_irq(wq, condition, lock)		\
+do {								\
+	wait_queue_t __wait;					\
+	init_waitqueue_entry(&__wait, current);			\
+								\
+	add_wait_queue(&wq, &__wait);				\
+	for (;;) {						\
+		set_current_state(TASK_UNINTERRUPTIBLE);	\
+		if (condition)					\
+			break;					\
+		spin_unlock_irq(&lock);				\
+		run_task_queue(&tq_disk);			\
+		schedule();					\
+		spin_lock_irq(&lock);				\
+	}							\
+	current->state = TASK_RUNNING;				\
+	remove_wait_queue(&wq, &__wait);			\
+} while (0)
+
+#define wait_event_lock_irq(wq, condition, lock)		\
+do {								\
+	if (condition)						\
+		break;						\
+	__wait_event_lock_irq(wq, condition, lock);		\
+} while (0)
+
+
+#define __wait_disk_event(wq, condition)			\
+do {								\
+	wait_queue_t __wait;					\
+	init_waitqueue_entry(&__wait, current);			\
+								\
+	add_wait_queue(&wq, &__wait);				\
+	for (;;) {						\
+		set_current_state(TASK_UNINTERRUPTIBLE);	\
+		if (condition)					\
+			break;					\
+		run_task_queue(&tq_disk);			\
+		schedule();					\
+	}							\
+	current->state = TASK_RUNNING;				\
+	remove_wait_queue(&wq, &__wait);			\
+} while (0)
+
+#define wait_disk_event(wq, condition)				\
+do {								\
+	if (condition)						\
+		break;						\
+	__wait_disk_event(wq, condition);			\
+} while (0)
+
+#endif
+
diff -urN /md0/kernels/2.4/v2.4.9-ac14/MAINTAINERS aio-v2.4.9-ac14.diff/MAINTAINERS
--- /md0/kernels/2.4/v2.4.9-ac14/MAINTAINERS	Mon Sep 24 02:14:12 2001
+++ aio-v2.4.9-ac14.diff/MAINTAINERS	Mon Sep 24 19:09:13 2001
@@ -201,6 +201,12 @@
 L:	linux-net@vger.kernel.org
 S:	Maintained
 
+ASYNC IO
+P:	Benjamin LaHaise
+M:	bcrl@redhat.com
+L:	linux-aio@kvack.org
+S:	Maintained
+
 AX.25 NETWORK LAYER
 P:	Matthias Welwarsky
 M:	dg2fef@afthd.tu-darmstadt.de
diff -urN /md0/kernels/2.4/v2.4.9-ac14/Makefile aio-v2.4.9-ac14.diff/Makefile
--- /md0/kernels/2.4/v2.4.9-ac14/Makefile	Mon Sep 24 02:14:12 2001
+++ aio-v2.4.9-ac14.diff/Makefile	Mon Sep 24 19:09:29 2001
@@ -1,7 +1,7 @@
 VERSION = 2
 PATCHLEVEL = 4
 SUBLEVEL = 9
-EXTRAVERSION = -ac14
+EXTRAVERSION = -ac14-aio
 
 KERNELRELEASE=$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION)
 
diff -urN /md0/kernels/2.4/v2.4.9-ac14/arch/i386/kernel/entry.S aio-v2.4.9-ac14.diff/arch/i386/kernel/entry.S
--- /md0/kernels/2.4/v2.4.9-ac14/arch/i386/kernel/entry.S	Mon Sep 24 02:14:12 2001
+++ aio-v2.4.9-ac14.diff/arch/i386/kernel/entry.S	Mon Sep 24 19:09:13 2001
@@ -626,6 +626,12 @@
 	.long SYMBOL_NAME(sys_getdents64)	/* 220 */
 	.long SYMBOL_NAME(sys_fcntl64)
 	.long SYMBOL_NAME(sys_ni_syscall)	/* reserved for TUX */
+	.long SYMBOL_NAME(sys___io_setup)	/* 223 */
+	.long SYMBOL_NAME(sys___io_destroy)
+	.long SYMBOL_NAME(sys___io_getevents)
+	.long SYMBOL_NAME(sys___io_submit)
+	.long SYMBOL_NAME(sys___io_cancel)
+	.long SYMBOL_NAME(sys___io_wait)
 
 	.rept NR_syscalls-(.-sys_call_table)/4
 		.long SYMBOL_NAME(sys_ni_syscall)
diff -urN /md0/kernels/2.4/v2.4.9-ac14/drivers/char/raw.c aio-v2.4.9-ac14.diff/drivers/char/raw.c
--- /md0/kernels/2.4/v2.4.9-ac14/drivers/char/raw.c	Mon Sep 24 02:14:14 2001
+++ aio-v2.4.9-ac14.diff/drivers/char/raw.c	Mon Sep 24 21:33:09 2001
@@ -16,6 +16,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 #define dprintk(x...) 
 
@@ -36,13 +38,18 @@
 int	raw_open(struct inode *, struct file *);
 int	raw_release(struct inode *, struct file *);
 int	raw_ctl_ioctl(struct inode *, struct file *, unsigned int, unsigned long);
-
+int	raw_kvec_read(struct file *filp, kvec_cb_t cb, size_t size, loff_t pos);
+int	raw_kvec_write(struct file *filp, kvec_cb_t cb, size_t size, loff_t pos);
 
 static struct file_operations raw_fops = {
 	read:		raw_read,
 	write:		raw_write,
 	open:		raw_open,
 	release:	raw_release,
+	aio_read:	generic_file_aio_read,
+	aio_write:	generic_file_aio_write,
+	kvec_read:	raw_kvec_read,
+	kvec_write:	raw_kvec_write,
 };
 
 static struct file_operations raw_ctl_fops = {
@@ -130,7 +137,7 @@
 	 * the blocksize on a device which is already mounted.
 	 */
 
-	sector_size = 512;
+	sector_size = 2048;
 	if (is_mounted(rdev)) {
 		if (blksize_size[MAJOR(rdev)])
 			sector_size = blksize_size[MAJOR(rdev)][MINOR(rdev)];
@@ -260,7 +267,6 @@
 }
 
-
 ssize_t	raw_read(struct file *filp, char * buf,
 		size_t size, loff_t *offp)
 {
@@ -393,3 +399,83 @@
 out:
 	return err;
 }
+
+static int raw_kvec_rw(struct file *filp, int rw, kvec_cb_t cb, size_t size, loff_t pos);
+int raw_kvec_read(struct file *file, kvec_cb_t cb, size_t size, loff_t pos)
+{
+	return raw_kvec_rw(file, READ, cb, size, pos);
+}
+
+int raw_kvec_write(struct file *file, kvec_cb_t cb, size_t size, loff_t pos)
+{
+	return raw_kvec_rw(file, WRITE, cb, size, pos);
+}
+
+int raw_kvec_rw(struct file *filp, int rw, kvec_cb_t cb, size_t size, loff_t pos)
+{
+	int		err;
+	unsigned	minor;
+	kdev_t		dev;
+	unsigned long	limit, blocknr, blocks;
+
+	unsigned	sector_size, sector_bits, sector_mask;
+	unsigned	max_sectors;
+
+printk("raw: cb.kvec=%p\n", cb.vec);
+	pr_debug("raw_rw_kiovec: %p %d %d %p %d %d %Lu\n", filp, rw, nr, kiovec, flags, size, pos);
+	/*
+	 * First, a few checks on device size limits
+	 */
+
+	minor = MINOR(filp->f_dentry->d_inode->i_rdev);
+	dev = to_kdev_t(raw_devices[minor].binding->bd_dev);
+	sector_size = raw_devices[minor].sector_size;
+	sector_bits = raw_devices[minor].sector_bits;
+	sector_mask = sector_size - 1;
+	max_sectors = 25000; //KIO_MAX_SECTORS >> (sector_bits - 9);
+
+	if (blk_size[MAJOR(dev)])
+		limit = (((loff_t) blk_size[MAJOR(dev)][MINOR(dev)]) << BLOCK_SIZE_BITS) >> sector_bits;
+	else
+		limit = INT_MAX;
+	dprintk ("rw_raw_dev_async: dev %d:%d (+%d)\n",
+		 MAJOR(dev), MINOR(dev), limit);
+
+	err = -EINVAL;
+	if ((pos < 0) || (pos & sector_mask) || (size & sector_mask)) {
+		printk("pos/size wrong\n");
+		goto out;
+	}
+
+	err = -ENXIO;
+	if ((pos >> sector_bits) >= limit) {
+		printk("raw: %Lu > %lu, %d\n", pos >> sector_bits, limit, sector_bits);
+		goto out;
+	}
+
+	/*
+	 * Split the IO into KIO_MAX_SECTORS chunks, mapping and
+	 * unmapping the single kiobuf as we go to perform each chunk of
+	 * IO.
+	 */
+
+	blocknr = pos >> sector_bits;
+	blocks = size >> sector_bits;
+	if (blocks > max_sectors)
+		blocks = max_sectors;
+	if (blocks > limit - blocknr)
+		blocks = limit - blocknr;
+	err = -ENXIO;
+	pr_debug("raw: !blocks %d %ld %ld\n", max_sectors, limit, blocknr);
+	if (!blocks)
+		goto out;
+
+printk("raw: cb.kvec=%p\n", cb.vec);
+	err = brw_kvec_async(rw, cb, dev, blocks, blocknr, sector_bits);
+	pr_debug("brw_kiovec_async: %d\n", err);
+
+out:
+	pr_debug("brw_kiovec_async: ret is %d\n", err);
+	return err;
+}
+
diff -urN /md0/kernels/2.4/v2.4.9-ac14/drivers/net/ns83820.c aio-v2.4.9-ac14.diff/drivers/net/ns83820.c
--- /md0/kernels/2.4/v2.4.9-ac14/drivers/net/ns83820.c	Mon Sep 24 02:14:14 2001
+++ aio-v2.4.9-ac14.diff/drivers/net/ns83820.c	Tue Sep 25 15:14:59 2001
@@ -1,7 +1,7 @@
-#define VERSION "0.11"
+#define VERSION "0.12pre"
 /* ns83820.c by Benjamin LaHaise
  *
- * $Revision: 1.34.2.2 $
+ * $Revision: 1.34.2.7 $
  *
  * Copyright 2001 Benjamin LaHaise.
  * Copyright 2001 Red Hat.
@@ -41,7 +41,9 @@
  *	20010827	0.10 - fix ia64 unaligned access.
  *	20010906	0.11 - accept all packets with checksum errors as
  *			       otherwise fragments get lost
-			     - fix >> 32 bugs
+ *			     - fix >> 32 bugs
+ *			0.12 - add statistics counters
+ *			       not yet- add multicast support
  *
  * Driver Overview
  * ===============
@@ -61,7 +63,9 @@
  *	Cameo		SOHO-GA2000T	SOHO-GA2500T
  *	D-Link		DGE-500T
  *	PureData	PDP8023Z-TG
- *	SMC		SMC9462TX
+ *	SMC		SMC9452TX	SMC9462TX
+ *
+ * Special thanks to SMC for providing hardware to test this driver on.
  *
  * Reports of success or failure would be greatly appreciated.
  */
@@ -80,16 +84,15 @@
 #include 	/* for iph */
 #include 	/* for IPPROTO_... */
 #include 
+#include 
 //#include 
+#include 
+
 /* Dprintk is used for more interesting debug events */
 #undef Dprintk
 #define	Dprintk			dprintk
 
-#if !defined(GCC_VERSION) || (GCC_VERSION < 2096)
-#define __builtin_expect(x,y)	(x)
-#endif
-
 #ifdef CONFIG_HIGHMEM64G
 #define USE_64BIT_ADDR
 #elif defined(__ia64__)
@@ -367,6 +370,7 @@
 
 struct ns83820 {
 	struct net_device	net_dev;
+	struct net_device_stats	stats;
 	u8			*base;
 
 	struct pci_dev		*pci_dev;
@@ -733,39 +737,22 @@
 			kfree_skb(skb);
 			skb = tmp;
 #endif
+			if (cmdsts & CMDSTS_DEST_MULTI)
+				dev->stats.multicast ++;
+			dev->stats.rx_packets ++;
+			dev->stats.rx_bytes += len;
 			if ((extsts & 0x002a0000) && !(extsts & 0x00540000)) {
 				skb->ip_summed = CHECKSUM_UNNECESSARY;
 			} else {
 				skb->ip_summed = CHECKSUM_NONE;
 			}
 			skb->protocol = eth_type_trans(skb, &dev->net_dev);
-			switch (netif_rx(skb)) {
-			case NET_RX_SUCCESS:
-				dev->ihr = 3;
-				break;
-			case NET_RX_CN_LOW:
-				dev->ihr = 3;
-				break;
-			case NET_RX_CN_MOD:
-				dev->ihr = dev->ihr + 1;
-				break;
-			case NET_RX_CN_HIGH:
-				dev->ihr += dev->ihr/2 + 1;
-				break;
-			case NET_RX_DROP:
-				dev->ihr = 255;
-				break;
-			}
-			if (dev->ihr > 255)
-				dev->ihr = 255;
+			if (NET_RX_DROP == netif_rx(skb))
+				dev->stats.rx_dropped ++;
 #ifndef __i386__
 		done:;
 #endif
 		} else {
-			static int err;
-			if (err++ < 20) {
-				Dprintk("error packet: cmdsts: %08x extsts: %08x\n", cmdsts, extsts);
-			}
 			kfree_skb(skb);
 		}
 
@@ -808,6 +795,13 @@
 	       !(CMDSTS_OWN & (cmdsts = desc[CMDSTS])) ) {
 		struct sk_buff *skb;
 
+		if (cmdsts & CMDSTS_ERR)
+			dev->stats.tx_errors ++;
+		if (cmdsts & CMDSTS_OK)
+			dev->stats.tx_packets ++;
+		if (cmdsts & CMDSTS_OK)
+			dev->stats.tx_bytes += cmdsts & 0xffff;
+
 		dprintk("tx_done_idx=%d free_idx=%d cmdsts=%08x\n",
 			tx_done_idx, dev->tx_free_idx, desc[CMDSTS]);
 		skb = dev->tx_skbs[tx_done_idx];
@@ -986,6 +980,35 @@
 	return 0;
 }
 
+static void ns83820_update_stats(struct ns83820 *dev)
+{
+	u8 *base = dev->base;
+
+
dev->stats.rx_errors += readl(base + 0x60) & 0xffff; + dev->stats.rx_crc_errors += readl(base + 0x64) & 0xffff; + dev->stats.rx_missed_errors += readl(base + 0x68) & 0xffff; + dev->stats.rx_frame_errors += readl(base + 0x6c) & 0xffff; + /*dev->stats.rx_symbol_errors +=*/ readl(base + 0x70); + dev->stats.rx_length_errors += readl(base + 0x74) & 0xffff; + dev->stats.rx_length_errors += readl(base + 0x78) & 0xffff; + /*dev->stats.rx_badopcode_errors += */ readl(base + 0x7c); + /*dev->stats.rx_pause_count += */ readl(base + 0x80); + /*dev->stats.tx_pause_count += */ readl(base + 0x84); + dev->stats.tx_carrier_errors += readl(base + 0x88) & 0xff; +} + +static struct net_device_stats *ns83820_get_stats(struct net_device *_dev) +{ + struct ns83820 *dev = (void *)_dev; + + /* somewhat overkill */ + spin_lock_irq(&dev->misc_lock); + ns83820_update_stats(dev); + spin_unlock_irq(&dev->misc_lock); + + return &dev->stats; +} + static void ns83820_irq(int foo, void *data, struct pt_regs *regs) { struct ns83820 *dev = data; @@ -1061,6 +1084,12 @@ if ((ISR_TXDESC | ISR_TXIDLE) & isr) do_tx_done(dev); + if (ISR_MIB & isr) { + spin_lock(&dev->misc_lock); + ns83820_update_stats(dev); + spin_unlock(&dev->misc_lock); + } + if (ISR_PHY & isr) phy_intr(dev); } @@ -1179,6 +1208,28 @@ return 0; } +static void ns83820_set_multicast(struct net_device *_dev) +{ + struct ns83820 *dev = (void *)_dev; + u8 *rfcr = dev->base + RFCR; + u32 and_mask = 0xffffffff; + u32 or_mask = 0; + + if (dev->net_dev.flags & IFF_PROMISC) + or_mask |= RFCR_AAU | RFCR_AAM; + else + and_mask &= ~(RFCR_AAU | RFCR_AAM); + + if (dev->net_dev.flags & IFF_ALLMULTI) + or_mask |= RFCR_AAM; + else + and_mask &= ~RFCR_AAM; + + spin_lock_irq(&dev->misc_lock); + writel((readl(rfcr) & and_mask) | or_mask, rfcr); + spin_unlock_irq(&dev->misc_lock); +} + static int ns83820_probe(struct pci_dev *pci_dev, const struct pci_device_id *id) { struct ns83820 *dev; @@ -1242,6 +1293,9 @@ dev->net_dev.stop = ns83820_stop; dev->net_dev.hard_start_xmit = ns83820_hard_start_xmit; dev->net_dev.change_mtu = ns83820_change_mtu; + dev->net_dev.get_stats = ns83820_get_stats; + dev->net_dev.change_mtu = ns83820_change_mtu; + dev->net_dev.set_multicast_list = ns83820_set_multicast; //FIXME: dev->net_dev.tx_timeout = ns83820_tx_timeout; lock_kernel(); @@ -1425,5 +1479,6 @@ MODULE_AUTHOR("Benjamin LaHaise "); MODULE_DESCRIPTION("National Semiconductor DP83820 10/100/1000 driver"); MODULE_DEVICE_TABLE(pci, pci_device_id); +MODULE_LICENSE("GPL"); module_init(ns83820_init); module_exit(ns83820_exit); diff -urN /md0/kernels/2.4/v2.4.9-ac14/drivers/scsi/53c700-mem.c aio-v2.4.9-ac14.diff/drivers/scsi/53c700-mem.c --- /md0/kernels/2.4/v2.4.9-ac14/drivers/scsi/53c700-mem.c Wed Dec 31 19:00:00 1969 +++ aio-v2.4.9-ac14.diff/drivers/scsi/53c700-mem.c Mon Sep 24 19:16:27 2001 @@ -0,0 +1,1842 @@ +/* WARNING: GENERATED FILE (from 53c700.c), DO NOT MODIFY */ +#define MEM_MAPPED +/* -*- mode: c; c-basic-offset: 8 -*- */ + +/* NCR (or Symbios) 53c700 and 53c700-66 Driver + * + * Copyright (C) 2001 by James.Bottomley@HansenPartnership.com +**----------------------------------------------------------------------------- +** +** This program is free software; you can redistribute it and/or modify +** it under the terms of the GNU General Public License as published by +** the Free Software Foundation; either version 2 of the License, or +** (at your option) any later version. 
+** +** This program is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +** GNU General Public License for more details. +** +** You should have received a copy of the GNU General Public License +** along with this program; if not, write to the Free Software +** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +** +**----------------------------------------------------------------------------- + */ + +/* Notes: + * + * This driver is designed exclusively for these chips (virtually the + * earliest of the scripts engine chips). They need their own drivers + * because they are missing so many of the scripts and snazzy register + * features of their elder brothers (the 710, 720 and 770). + * + * The 700 is the lowliest of the line, it can only do async SCSI. + * The 700-66 can at least do synchronous SCSI up to 10MHz. + * + * The 700 chip has no host bus interface logic of its own. However, + * it is usually mapped to a location with well defined register + * offsets. Therefore, if you can determine the base address and the + * irq your board incorporating this chip uses, you can probably use + * this driver to run it (although you'll probably have to write a + * minimal wrapper for the purpose---see the NCR_D700 driver for + * details about how to do this). + * + * + * TODO List: + * + * 1. Better statistics in the proc fs + * + * 2. Implement message queue (queues SCSI messages like commands) and make + * the abort and device reset functions use them. + * */ + +/* CHANGELOG + * + * Version 2.3 + * + * More endianness/cache coherency changes. + * + * Better bad device handling (handles devices lying about tag + * queueing support and devices which fail to provide sense data on + * contingent allegiance conditions) + * + * Many thanks to Richard Hirst for patiently + * debugging this driver on the parisc architecture and suggesting + * many improvements and bug fixes. + * + * Thanks also go to Linuxcare Inc. for providing several PARISC + * machines for me to debug the driver on. + * + * Version 2.2 + * + * Made the driver mem or io mapped; added endian invariance; added + * dma cache flushing operations for architectures which need it; + * added support for more varied clocking speeds. + * + * Version 2.1 + * + * Initial modularisation from the D700. See NCR_D700.c for the rest of + * the changelog. 
+ * */ +#define NCR_700_VERSION "2.3" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "scsi.h" +#include "hosts.h" +#include "constants.h" + +#include "53c700.h" + +#ifdef NCR_700_DEBUG +#define STATIC +#else +#define STATIC static +#endif + +MODULE_AUTHOR("James Bottomley"); +MODULE_DESCRIPTION("53c700 and 53c700-66 Driver"); +MODULE_LICENSE("GPL"); + +/* This is the script */ +#include "53c700_d.h" + + +STATIC int NCR_700_queuecommand(Scsi_Cmnd *, void (*done)(Scsi_Cmnd *)); +STATIC int NCR_700_abort(Scsi_Cmnd * SCpnt); +STATIC int NCR_700_bus_reset(Scsi_Cmnd * SCpnt); +STATIC int NCR_700_dev_reset(Scsi_Cmnd * SCpnt); +STATIC int NCR_700_host_reset(Scsi_Cmnd * SCpnt); +STATIC int NCR_700_proc_directory_info(char *, char **, off_t, int, int, int); +STATIC void NCR_700_chip_setup(struct Scsi_Host *host); +STATIC void NCR_700_chip_reset(struct Scsi_Host *host); + +static char *NCR_700_phase[] = { + "", + "after selection", + "before command phase", + "after command phase", + "after status phase", + "after data in phase", + "after data out phase", + "during data phase", +}; + +static char *NCR_700_condition[] = { + "", + "NOT MSG_OUT", + "UNEXPECTED PHASE", + "NOT MSG_IN", + "UNEXPECTED MSG", + "MSG_IN", + "SDTR_MSG RECEIVED", + "REJECT_MSG RECEIVED", + "DISCONNECT_MSG RECEIVED", + "MSG_OUT", + "DATA_IN", + +}; + +static char *NCR_700_fatal_messages[] = { + "unexpected message after reselection", + "still MSG_OUT after message injection", + "not MSG_IN after selection", + "Illegal message length received", +}; + +static char *NCR_700_SBCL_bits[] = { + "IO ", + "CD ", + "MSG ", + "ATN ", + "SEL ", + "BSY ", + "ACK ", + "REQ ", +}; + +static char *NCR_700_SBCL_to_phase[] = { + "DATA_OUT", + "DATA_IN", + "CMD_OUT", + "STATE", + "ILLEGAL PHASE", + "ILLEGAL PHASE", + "MSG OUT", + "MSG IN", +}; + +static __u8 NCR_700_SDTR_msg[] = { + 0x01, /* Extended message */ + 0x03, /* Extended message Length */ + 0x01, /* SDTR Extended message */ + NCR_700_MIN_PERIOD, + NCR_700_MAX_OFFSET +}; + +struct Scsi_Host * __init +NCR_700_detect(Scsi_Host_Template *tpnt, + struct NCR_700_Host_Parameters *hostdata) +{ + __u32 *script = kmalloc(sizeof(SCRIPT), GFP_KERNEL); + __u32 pScript; + struct Scsi_Host *host; + static int banner = 0; + int j; + + /* Fill in the missing routines from the host template */ + tpnt->queuecommand = NCR_700_queuecommand; + tpnt->eh_abort_handler = NCR_700_abort; + tpnt->eh_device_reset_handler = NCR_700_dev_reset; + tpnt->eh_bus_reset_handler = NCR_700_bus_reset; + tpnt->eh_host_reset_handler = NCR_700_host_reset; + tpnt->can_queue = NCR_700_COMMAND_SLOTS_PER_HOST; + tpnt->sg_tablesize = NCR_700_SG_SEGMENTS; + tpnt->cmd_per_lun = NCR_700_MAX_TAGS; + tpnt->use_clustering = DISABLE_CLUSTERING; + tpnt->use_new_eh_code = 1; + tpnt->proc_info = NCR_700_proc_directory_info; + + if(tpnt->name == NULL) + tpnt->name = "53c700"; + if(tpnt->proc_name == NULL) + tpnt->proc_name = "53c700"; + + + if((host = scsi_register(tpnt, 4)) == NULL) + return NULL; + if(script == NULL) { + printk(KERN_ERR "53c700: Failed to allocate script, detatching\n"); + scsi_unregister(host); + return NULL; + } + + hostdata->slots = kmalloc(sizeof(struct NCR_700_command_slot) * NCR_700_COMMAND_SLOTS_PER_HOST, GFP_KERNEL); + if(hostdata->slots == NULL) { + printk(KERN_ERR "53c700: Failed to allocate command slots, detatching\n"); + scsi_unregister(host); + return NULL; + 
} + memset(hostdata->slots, 0, sizeof(struct NCR_700_command_slot) * NCR_700_COMMAND_SLOTS_PER_HOST); + for(j = 0; j < NCR_700_COMMAND_SLOTS_PER_HOST; j++) { + if(j == 0) + hostdata->free_list = &hostdata->slots[j]; + else + hostdata->slots[j-1].ITL_forw = &hostdata->slots[j]; + hostdata->slots[j].state = NCR_700_SLOT_FREE; + } + host->hostdata[0] = (__u32)hostdata; + for(j = 0; j < sizeof(SCRIPT)/sizeof(SCRIPT[0]); j++) { + script[j] = bS_to_host(SCRIPT[j]); + } + /* bus physical address of script */ + pScript = virt_to_bus(script); + /* adjust all labels to be bus physical */ + for(j = 0; j < PATCHES; j++) { + script[LABELPATCHES[j]] = bS_to_host(pScript + SCRIPT[LABELPATCHES[j]]); + } + /* now patch up fixed addresses */ + script_patch_32(script, MessageLocation, + virt_to_bus(&hostdata->msgout[0])); + script_patch_32(script, StatusAddress, + virt_to_bus(&hostdata->status)); + script_patch_32(script, ReceiveMsgAddress, + virt_to_bus(&hostdata->msgin[0])); + + hostdata->script = script; + hostdata->pScript = pScript; + hostdata->state = NCR_700_HOST_FREE; + spin_lock_init(&hostdata->lock); + hostdata->cmd = NULL; + host->max_id = 7; + host->max_lun = NCR_700_MAX_LUNS; + host->unique_id = hostdata->base; + host->base = hostdata->base; + host->hostdata[0] = (unsigned long)hostdata; + /* kick the chip */ + NCR_700_writeb(0xff, host, CTEST9_REG); + hostdata->rev = (NCR_700_readb(host, CTEST7_REG)<<4) & 0x0f; + hostdata->fast = (NCR_700_readb(host, CTEST9_REG) == 0); + if(banner == 0) { + printk(KERN_NOTICE "53c700: Version " NCR_700_VERSION " By James.Bottomley@HansenPartnership.com\n"); + banner = 1; + } + printk(KERN_NOTICE "scsi%d: %s rev %d %s\n", host->host_no, + hostdata->fast ? "53c700-66" : "53c700", + hostdata->rev, hostdata->differential ? + "(Differential)" : ""); + /* reset the chip */ + NCR_700_chip_reset(host); + NCR_700_writeb(ASYNC_OPERATION , host, SXFER_REG); + + return host; +} + +int +NCR_700_release(struct Scsi_Host *host) +{ + struct NCR_700_Host_Parameters *hostdata = + (struct NCR_700_Host_Parameters *)host->hostdata[0]; + + kfree(hostdata->script); + return 1; +} + +static inline __u8 +NCR_700_identify(int can_disconnect, __u8 lun) +{ + return IDENTIFY_BASE | + ((can_disconnect) ? 0x40 : 0) | + (lun & NCR_700_LUN_MASK); +} + +/* + * Function : static int datapath_residual (Scsi_Host *host) + * + * Purpose : return residual data count of what's in the chip. If you + * really want to know what this function is doing, it's almost a + * direct transcription of the algorithm described in the 53c710 + * guide, except that the DBC and DFIFO registers are only 6 bits + * wide. 
+ * + * Inputs : host - SCSI host */ +static inline int +NCR_700_data_residual (struct Scsi_Host *host) { + int count, synchronous; + unsigned int ddir; + + count = ((NCR_700_readb(host, DFIFO_REG) & 0x3f) - + (NCR_700_readl(host, DBC_REG) & 0x3f)) & 0x3f; + + synchronous = NCR_700_readb(host, SXFER_REG) & 0x0f; + + /* get the data direction */ + ddir = NCR_700_readb(host, CTEST0_REG) & 0x01; + + if (ddir) { + /* Receive */ + if (synchronous) + count += (NCR_700_readb(host, SSTAT2_REG) & 0xf0) >> 4; + else + if (NCR_700_readb(host, SSTAT1_REG) & SIDL_REG_FULL) + ++count; + } else { + /* Send */ + __u8 sstat = NCR_700_readb(host, SSTAT1_REG); + if (sstat & SODL_REG_FULL) + ++count; + if (synchronous && (sstat & SODR_REG_FULL)) + ++count; + } + return count; +} + +/* print out the SCSI wires and corresponding phase from the SBCL register + * in the chip */ +static inline char * +sbcl_to_string(__u8 sbcl) +{ + int i; + static char ret[256]; + + ret[0]='\0'; + for(i=0; i<8; i++) { + if((1<free_list; + + if(slot == NULL) { + /* sanity check */ + if(hostdata->command_slot_count != NCR_700_COMMAND_SLOTS_PER_HOST) + printk(KERN_ERR "SLOTS FULL, but count is %d, should be %d\n", hostdata->command_slot_count, NCR_700_COMMAND_SLOTS_PER_HOST); + return NULL; + } + + if(slot->state != NCR_700_SLOT_FREE) + /* should panic! */ + printk(KERN_ERR "BUSY SLOT ON FREE LIST!!!\n"); + + + hostdata->free_list = slot->ITL_forw; + slot->ITL_forw = NULL; + + + /* NOTE: set the state to busy here, not queued, since this + * indicates the slot is in use and cannot be run by the IRQ + * finish routine. If we cannot queue the command when it + * is properly build, we then change to NCR_700_SLOT_QUEUED */ + slot->state = NCR_700_SLOT_BUSY; + hostdata->command_slot_count++; + + return slot; +} + +STATIC void +free_slot(struct NCR_700_command_slot *slot, + struct NCR_700_Host_Parameters *hostdata) +{ + int hash; + struct NCR_700_command_slot **forw, **back; + + + if((slot->state & NCR_700_SLOT_MASK) != NCR_700_SLOT_MAGIC) { + printk(KERN_ERR "53c700: SLOT %p is not MAGIC!!!\n", slot); + } + if(slot->state == NCR_700_SLOT_FREE) { + printk(KERN_ERR "53c700: SLOT %p is FREE!!!\n", slot); + } + /* remove from queues */ + if(slot->tag != NCR_700_NO_TAG) { + hash = hash_ITLQ(slot->cmnd->target, slot->cmnd->lun, + slot->tag); + if(slot->ITLQ_forw == NULL) + back = &hostdata->ITLQ_Hash_back[hash]; + else + back = &slot->ITLQ_forw->ITLQ_back; + + if(slot->ITLQ_back == NULL) + forw = &hostdata->ITLQ_Hash_forw[hash]; + else + forw = &slot->ITLQ_back->ITLQ_forw; + + *forw = slot->ITLQ_forw; + *back = slot->ITLQ_back; + } + hash = hash_ITL(slot->cmnd->target, slot->cmnd->lun); + if(slot->ITL_forw == NULL) + back = &hostdata->ITL_Hash_back[hash]; + else + back = &slot->ITL_forw->ITL_back; + + if(slot->ITL_back == NULL) + forw = &hostdata->ITL_Hash_forw[hash]; + else + forw = &slot->ITL_back->ITL_forw; + + *forw = slot->ITL_forw; + *back = slot->ITL_back; + + slot->resume_offset = 0; + slot->cmnd = NULL; + slot->state = NCR_700_SLOT_FREE; + slot->ITL_forw = hostdata->free_list; + hostdata->free_list = slot; + hostdata->command_slot_count--; +} + + +/* This routine really does very little. 
The command is indexed on + the ITL and (if tagged) the ITLQ lists in _queuecommand */ +STATIC void +save_for_reselection(struct NCR_700_Host_Parameters *hostdata, + Scsi_Cmnd *SCp, __u32 dsp) +{ + /* Its just possible that this gets executed twice */ + if(SCp != NULL) { + struct NCR_700_command_slot *slot = + (struct NCR_700_command_slot *)SCp->host_scribble; + + slot->resume_offset = dsp; + } + hostdata->state = NCR_700_HOST_FREE; + hostdata->cmd = NULL; +} + +/* Most likely nexus is the oldest in each case */ +STATIC inline struct NCR_700_command_slot * +find_ITL_Nexus(struct NCR_700_Host_Parameters *hostdata, __u8 pun, __u8 lun) +{ + int hash = hash_ITL(pun, lun); + struct NCR_700_command_slot *slot = hostdata->ITL_Hash_back[hash]; + while(slot != NULL && !(slot->cmnd->target == pun && + slot->cmnd->lun == lun)) + slot = slot->ITL_back; + return slot; +} + +STATIC inline struct NCR_700_command_slot * +find_ITLQ_Nexus(struct NCR_700_Host_Parameters *hostdata, __u8 pun, + __u8 lun, __u8 tag) +{ + int hash = hash_ITLQ(pun, lun, tag); + struct NCR_700_command_slot *slot = hostdata->ITLQ_Hash_back[hash]; + + while(slot != NULL && !(slot->cmnd->target == pun + && slot->cmnd->lun == lun && slot->tag == tag)) + slot = slot->ITLQ_back; + +#ifdef NCR_700_TAG_DEBUG + if(slot != NULL) { + struct NCR_700_command_slot *n = slot->ITLQ_back; + while(n != NULL && n->cmnd->target != pun + && n->cmnd->lun != lun && n->tag != tag) + n = n->ITLQ_back; + + if(n != NULL && n->cmnd->target == pun && n->cmnd->lun == lun + && n->tag == tag) { + printk(KERN_WARNING "53c700: WARNING: DUPLICATE tag %d\n", + tag); + } + } +#endif + return slot; +} + + + +/* This translates the SDTR message offset and period to a value + * which can be loaded into the SXFER_REG. + * + * NOTE: According to SCSI-2, the true transfer period (in ns) is + * actually four times this period value */ +STATIC inline __u8 +NCR_700_offset_period_to_sxfer(struct NCR_700_Host_Parameters *hostdata, + __u8 offset, __u8 period) +{ + int XFERP; + + if(period*4 < NCR_700_MIN_PERIOD) { + printk(KERN_WARNING "53c700: Period %dns is less than SCSI-2 minimum, setting to %d\n", period*4, NCR_700_MIN_PERIOD); + period = NCR_700_MIN_PERIOD/4; + } + XFERP = (period*4 * hostdata->sync_clock)/1000 - 4; + if(offset > NCR_700_MAX_OFFSET) { + printk(KERN_WARNING "53c700: Offset %d exceeds maximum, setting to %d\n", + offset, NCR_700_MAX_OFFSET); + offset = NCR_700_MAX_OFFSET; + } + if(XFERP < NCR_700_MIN_XFERP) { + printk(KERN_WARNING "53c700: XFERP %d is less than minium, setting to %d\n", + XFERP, NCR_700_MIN_XFERP); + XFERP = NCR_700_MIN_XFERP; + } + return (offset & 0x0f) | (XFERP & 0x07)<<4; +} + + +STATIC inline void +NCR_700_scsi_done(struct NCR_700_Host_Parameters *hostdata, + Scsi_Cmnd *SCp, int result) +{ + hostdata->state = NCR_700_HOST_FREE; + hostdata->cmd = NULL; + + if(SCp != NULL) { + struct NCR_700_command_slot *slot = + (struct NCR_700_command_slot *)SCp->host_scribble; + + if(SCp->cmnd[0] == REQUEST_SENSE && SCp->cmnd[6] == NCR_700_INTERNAL_SENSE_MAGIC) { +#ifdef NCR_700_DEBUG + printk(" ORIGINAL CMD %p RETURNED %d, new return is %d sense is", + SCp, SCp->cmnd[7], result); + print_sense("53c700", SCp); +#endif + if(result == 0) + result = SCp->cmnd[7]; + } + + free_slot(slot, hostdata); + + SCp->host_scribble = NULL; + SCp->result = result; + SCp->scsi_done(SCp); + if(NCR_700_get_depth(SCp->device) == 0 || + NCR_700_get_depth(SCp->device) > NCR_700_MAX_TAGS) + printk(KERN_ERR "Invalid depth in NCR_700_scsi_done(): %d\n", + 
NCR_700_get_depth(SCp->device)); + NCR_700_set_depth(SCp->device, NCR_700_get_depth(SCp->device) - 1); + } else { + printk(KERN_ERR "53c700: SCSI DONE HAS NULL SCp\n"); + } +} + + +STATIC void +NCR_700_internal_bus_reset(struct Scsi_Host *host) +{ + /* Bus reset */ + NCR_700_writeb(ASSERT_RST, host, SCNTL1_REG); + udelay(50); + NCR_700_writeb(0, host, SCNTL1_REG); + +} + +STATIC void +NCR_700_chip_setup(struct Scsi_Host *host) +{ + struct NCR_700_Host_Parameters *hostdata = + (struct NCR_700_Host_Parameters *)host->hostdata[0]; + + NCR_700_writeb(1 << host->this_id, host, SCID_REG); + NCR_700_writeb(0, host, SBCL_REG); + NCR_700_writeb(0, host, SXFER_REG); + + NCR_700_writeb(PHASE_MM_INT | SEL_TIMEOUT_INT | GROSS_ERR_INT | UX_DISC_INT + | RST_INT | PAR_ERR_INT | SELECT_INT, host, SIEN_REG); + + NCR_700_writeb(ABORT_INT | INT_INST_INT | ILGL_INST_INT, host, DIEN_REG); + NCR_700_writeb(BURST_LENGTH_8, host, DMODE_REG); + NCR_700_writeb(FULL_ARBITRATION | PARITY | AUTO_ATN, host, SCNTL0_REG); + NCR_700_writeb(LAST_DIS_ENBL | ENABLE_ACTIVE_NEGATION|GENERATE_RECEIVE_PARITY, + host, CTEST8_REG); + NCR_700_writeb(ENABLE_SELECT, host, SCNTL1_REG); + if(hostdata->clock > 75) { + printk(KERN_ERR "53c700: Clock speed %dMHz is too high: 75Mhz is the maximum this chip can be driven at\n", hostdata->clock); + /* do the best we can, but the async clock will be out + * of spec: sync divider 2, async divider 3 */ + DEBUG(("53c700: sync 2 async 3\n")); + NCR_700_writeb(SYNC_DIV_2_0, host, SBCL_REG); + NCR_700_writeb(ASYNC_DIV_3_0, host, DCNTL_REG); + hostdata->sync_clock = hostdata->clock/2; + } else if(hostdata->clock > 50 && hostdata->clock <= 75) { + /* sync divider 1.5, async divider 3 */ + DEBUG(("53c700: sync 1.5 async 3\n")); + NCR_700_writeb(SYNC_DIV_1_5, host, SBCL_REG); + NCR_700_writeb(ASYNC_DIV_3_0, host, DCNTL_REG); + hostdata->sync_clock = hostdata->clock*2; + hostdata->sync_clock /= 3; + + } else if(hostdata->clock > 37 && hostdata->clock <= 50) { + /* sync divider 1, async divider 2 */ + DEBUG(("53c700: sync 1 async 2\n")); + NCR_700_writeb(SYNC_DIV_1_0, host, SBCL_REG); + NCR_700_writeb(ASYNC_DIV_2_0, host, DCNTL_REG); + hostdata->sync_clock = hostdata->clock; + } else if(hostdata->clock > 25 && hostdata->clock <=37) { + /* sync divider 1, async divider 1.5 */ + DEBUG(("53c700: sync 1 async 1.5\n")); + NCR_700_writeb(SYNC_DIV_1_0, host, SBCL_REG); + NCR_700_writeb(ASYNC_DIV_1_5, host, DCNTL_REG); + hostdata->sync_clock = hostdata->clock; + } else { + DEBUG(("53c700: sync 1 async 1\n")); + NCR_700_writeb(SYNC_DIV_1_0, host, SBCL_REG); + NCR_700_writeb(ASYNC_DIV_1_0, host, DCNTL_REG); + /* sync divider 1, async divider 1 */ + } +} + +STATIC void +NCR_700_chip_reset(struct Scsi_Host *host) +{ + /* Chip reset */ + NCR_700_writeb(SOFTWARE_RESET, host, DCNTL_REG); + udelay(100); + + NCR_700_writeb(0, host, DCNTL_REG); + + mdelay(1000); + + NCR_700_chip_setup(host); +} + +/* The heart of the message processing engine is that the instruction + * immediately after the INT is the normal case (and so must be CLEAR + * ACK). 
If we want to do something else, we call that routine in + * scripts and set temp to be the normal case + 8 (skipping the CLEAR + * ACK) so that the routine returns correctly to resume its activity + * */ +STATIC __u32 +process_extended_message(struct Scsi_Host *host, + struct NCR_700_Host_Parameters *hostdata, + Scsi_Cmnd *SCp, __u32 dsp, __u32 dsps) +{ + __u32 resume_offset = dsp, temp = dsp + 8; + __u8 pun = 0xff, lun = 0xff; + + if(SCp != NULL) { + pun = SCp->target; + lun = SCp->lun; + } + + switch(hostdata->msgin[2]) { + case A_SDTR_MSG: + if(SCp != NULL && NCR_700_is_flag_set(SCp->device, NCR_700_DEV_BEGIN_SYNC_NEGOTIATION)) { + __u8 period = hostdata->msgin[3]; + __u8 offset = hostdata->msgin[4]; + __u8 sxfer; + + if(offset != 0 && period != 0) + sxfer = NCR_700_offset_period_to_sxfer(hostdata, offset, period); + else + sxfer = 0; + + if(sxfer != NCR_700_get_SXFER(SCp->device)) { + printk(KERN_INFO "scsi%d: (%d:%d) Synchronous at offset %d, period %dns\n", + host->host_no, pun, lun, + offset, period*4); + + NCR_700_set_SXFER(SCp->device, sxfer); + } + + + NCR_700_set_flag(SCp->device, NCR_700_DEV_NEGOTIATED_SYNC); + NCR_700_clear_flag(SCp->device, NCR_700_DEV_BEGIN_SYNC_NEGOTIATION); + + NCR_700_writeb(NCR_700_get_SXFER(SCp->device), + host, SXFER_REG); + + } else { + /* SDTR message out of the blue, reject it */ + printk(KERN_WARNING "scsi%d Unexpected SDTR msg\n", + host->host_no); + hostdata->msgout[0] = A_REJECT_MSG; + dma_cache_wback((unsigned long)hostdata->msgout, sizeof(hostdata->msgout)); + script_patch_16(hostdata->script, MessageCount, 1); + /* SendMsgOut returns, so set up the return + * address */ + resume_offset = hostdata->pScript + Ent_SendMessageWithATN; + } + break; + + case A_WDTR_MSG: + printk(KERN_INFO "scsi%d: (%d:%d), Unsolicited WDTR after CMD, Rejecting\n", + host->host_no, pun, lun); + hostdata->msgout[0] = A_REJECT_MSG; + dma_cache_wback((unsigned long)hostdata->msgout, sizeof(hostdata->msgout)); + script_patch_16(hostdata->script, MessageCount, 1); + resume_offset = hostdata->pScript + Ent_SendMessageWithATN; + + break; + + default: + printk(KERN_INFO "scsi%d (%d:%d): Unexpected message %s: ", + host->host_no, pun, lun, + NCR_700_phase[(dsps & 0xf00) >> 8]); + print_msg(hostdata->msgin); + printk("\n"); + /* just reject it */ + hostdata->msgout[0] = A_REJECT_MSG; + dma_cache_wback((unsigned long)hostdata->msgout, sizeof(hostdata->msgout)); + script_patch_16(hostdata->script, MessageCount, 1); + /* SendMsgOut returns, so set up the return + * address */ + resume_offset = hostdata->pScript + Ent_SendMessageWithATN; + } + NCR_700_writel(temp, host, TEMP_REG); + return resume_offset; +} + +STATIC __u32 +process_message(struct Scsi_Host *host, struct NCR_700_Host_Parameters *hostdata, + Scsi_Cmnd *SCp, __u32 dsp, __u32 dsps) +{ + /* work out where to return to */ + __u32 temp = dsp + 8, resume_offset = dsp; + __u8 pun = 0xff, lun = 0xff; + + dma_cache_inv((unsigned long)hostdata->msgin, sizeof(hostdata->msgin)); + + if(SCp != NULL) { + pun = SCp->target; + lun = SCp->lun; + } + +#ifdef NCR_700_DEBUG + printk("scsi%d (%d:%d): message %s: ", host->host_no, pun, lun, + NCR_700_phase[(dsps & 0xf00) >> 8]); + print_msg(hostdata->msgin); + printk("\n"); +#endif + + switch(hostdata->msgin[0]) { + + case A_EXTENDED_MSG: + return process_extended_message(host, hostdata, SCp, + dsp, dsps); + + case A_REJECT_MSG: + if(SCp != NULL && NCR_700_is_flag_set(SCp->device, NCR_700_DEV_BEGIN_SYNC_NEGOTIATION)) { + /* Rejected our sync negotiation attempt */ + 
NCR_700_set_SXFER(SCp->device, 0); + NCR_700_set_flag(SCp->device, NCR_700_DEV_NEGOTIATED_SYNC); + NCR_700_clear_flag(SCp->device, NCR_700_DEV_BEGIN_SYNC_NEGOTIATION); + } else if(SCp != NULL && NCR_700_is_flag_set(SCp->device, NCR_700_DEV_BEGIN_TAG_QUEUEING)) { + /* rejected our first simple tag message */ + printk(KERN_WARNING "scsi%d (%d:%d) Rejected first tag queue attempt, turning off tag queueing\n", host->host_no, pun, lun); + NCR_700_clear_flag(SCp->device, NCR_700_DEV_BEGIN_TAG_QUEUEING); + hostdata->tag_negotiated &= ~(1<target); + } else { + printk(KERN_WARNING "scsi%d (%d:%d) Unexpected REJECT Message %s\n", + host->host_no, pun, lun, + NCR_700_phase[(dsps & 0xf00) >> 8]); + /* however, just ignore it */ + } + break; + + case A_PARITY_ERROR_MSG: + printk(KERN_ERR "scsi%d (%d:%d) Parity Error!\n", host->host_no, + pun, lun); + NCR_700_internal_bus_reset(host); + break; + case A_SIMPLE_TAG_MSG: + printk(KERN_INFO "scsi%d (%d:%d) SIMPLE TAG %d %s\n", host->host_no, + pun, lun, hostdata->msgin[1], + NCR_700_phase[(dsps & 0xf00) >> 8]); + /* just ignore it */ + break; + default: + printk(KERN_INFO "scsi%d (%d:%d): Unexpected message %s: ", + host->host_no, pun, lun, + NCR_700_phase[(dsps & 0xf00) >> 8]); + + print_msg(hostdata->msgin); + printk("\n"); + /* just reject it */ + hostdata->msgout[0] = A_REJECT_MSG; + dma_cache_wback((unsigned long)hostdata->msgout, sizeof(hostdata->msgout)); + script_patch_16(hostdata->script, MessageCount, 1); + /* SendMsgOut returns, so set up the return + * address */ + resume_offset = hostdata->pScript + Ent_SendMessageWithATN; + + break; + } + NCR_700_writel(temp, host, TEMP_REG); + return resume_offset; +} + +STATIC __u32 +process_script_interrupt(__u32 dsps, __u32 dsp, Scsi_Cmnd *SCp, + struct Scsi_Host *host, + struct NCR_700_Host_Parameters *hostdata) +{ + __u32 resume_offset = 0; + __u8 pun = 0xff, lun=0xff; + + if(SCp != NULL) { + pun = SCp->target; + lun = SCp->lun; + } + + if(dsps == A_GOOD_STATUS_AFTER_STATUS) { + dma_cache_inv((unsigned long)hostdata->status, sizeof(hostdata->status)); + DEBUG((" COMMAND COMPLETE, status=%02x\n", + hostdata->status)); + /* OK, if TCQ still on, we know it works */ + NCR_700_clear_flag(SCp->device, NCR_700_DEV_BEGIN_TAG_QUEUEING); + /* check for contingent allegiance contitions */ + if(status_byte(hostdata->status) == CHECK_CONDITION || + status_byte(hostdata->status) == COMMAND_TERMINATED) { + struct NCR_700_command_slot *slot = + (struct NCR_700_command_slot *)SCp->host_scribble; + if(SCp->cmnd[0] == REQUEST_SENSE) { + /* OOPS: bad device, returning another + * contingent allegiance condition */ + printk(KERN_ERR "scsi%d (%d:%d) broken device is looping in contingent allegiance: ignoring\n", host->host_no, pun, lun); + NCR_700_scsi_done(hostdata, SCp, hostdata->status); + } else { + + DEBUG((" cmd %p has status %d, requesting sense\n", + SCp, hostdata->status)); + /* we can destroy the command here because the + * contingent allegiance condition will cause a + * retry which will re-copy the command from the + * saved data_cmnd */ + SCp->cmnd[0] = REQUEST_SENSE; + SCp->cmnd[1] = (SCp->lun & 0x7) << 5; + SCp->cmnd[2] = 0; + SCp->cmnd[3] = 0; + SCp->cmnd[4] = sizeof(SCp->sense_buffer); + SCp->cmnd[5] = 0; + SCp->cmd_len = 6; + /* Here's a quiet hack: the REQUEST_SENSE command is + * six bytes, so store a flag indicating that this + * was an internal sense request and the original + * status at the end of the command */ + SCp->cmnd[6] = NCR_700_INTERNAL_SENSE_MAGIC; + SCp->cmnd[7] = hostdata->status; + 
slot->SG[0].ins = bS_to_host(SCRIPT_MOVE_DATA_IN | sizeof(SCp->sense_buffer)); + slot->SG[0].pAddr = bS_to_host(virt_to_bus(SCp->sense_buffer)); + slot->SG[1].ins = bS_to_host(SCRIPT_RETURN); + slot->SG[1].pAddr = 0; + slot->resume_offset = hostdata->pScript; + dma_cache_wback((unsigned long)slot->SG, sizeof(slot->SG[0])*2); + dma_cache_inv((unsigned long)SCp->sense_buffer, sizeof(SCp->sense_buffer)); + + /* queue the command for reissue */ + slot->state = NCR_700_SLOT_QUEUED; + hostdata->state = NCR_700_HOST_FREE; + hostdata->cmd = NULL; + } + } else { + if(status_byte(hostdata->status) == GOOD && + SCp->cmnd[0] == INQUIRY && SCp->use_sg == 0) { + /* Piggy back the tag queueing support + * on this command */ + if(((char *)SCp->request_buffer)[7] & 0x02) { + printk(KERN_INFO "scsi%d: (%d:%d) Enabling Tag Command Queuing\n", host->host_no, pun, lun); + hostdata->tag_negotiated |= (1<target); + NCR_700_set_flag(SCp->device, NCR_700_DEV_BEGIN_TAG_QUEUEING); + } else { + NCR_700_clear_flag(SCp->device, NCR_700_DEV_BEGIN_TAG_QUEUEING); + hostdata->tag_negotiated &= ~(1<target); + } + } + NCR_700_scsi_done(hostdata, SCp, hostdata->status); + } + } else if((dsps & 0xfffff0f0) == A_UNEXPECTED_PHASE) { + __u8 i = (dsps & 0xf00) >> 8; + + printk(KERN_ERR "scsi%d: (%d:%d), UNEXPECTED PHASE %s (%s)\n", + host->host_no, pun, lun, + NCR_700_phase[i], + sbcl_to_string(NCR_700_readb(host, SBCL_REG))); + printk(KERN_ERR " len = %d, cmd =", SCp->cmd_len); + print_command(SCp->cmnd); + + NCR_700_internal_bus_reset(host); + } else if((dsps & 0xfffff000) == A_FATAL) { + int i = (dsps & 0xfff); + + printk(KERN_ERR "scsi%d: (%d:%d) FATAL ERROR: %s\n", + host->host_no, pun, lun, NCR_700_fatal_messages[i]); + if(dsps == A_FATAL_ILLEGAL_MSG_LENGTH) { + printk(KERN_ERR " msg begins %02x %02x\n", + hostdata->msgin[0], hostdata->msgin[1]); + } + NCR_700_internal_bus_reset(host); + } else if((dsps & 0xfffff0f0) == A_DISCONNECT) { +#ifdef NCR_700_DEBUG + __u8 i = (dsps & 0xf00) >> 8; + + printk("scsi%d: (%d:%d), DISCONNECTED (%d) %s\n", + host->host_no, pun, lun, + i, NCR_700_phase[i]); +#endif + save_for_reselection(hostdata, SCp, dsp); + + } else if(dsps == A_RESELECTION_IDENTIFIED) { + __u8 lun; + struct NCR_700_command_slot *slot; + __u8 reselection_id = hostdata->reselection_id; + + dma_cache_inv((unsigned long)hostdata->msgin, sizeof(hostdata->msgin)); + + lun = hostdata->msgin[0] & 0x1f; + + hostdata->reselection_id = 0xff; + DEBUG(("scsi%d: (%d:%d) RESELECTED!\n", + host->host_no, reselection_id, lun)); + /* clear the reselection indicator */ + if(hostdata->msgin[1] == A_SIMPLE_TAG_MSG) { + slot = find_ITLQ_Nexus(hostdata, reselection_id, + lun, hostdata->msgin[2]); + } else { + slot = find_ITL_Nexus(hostdata, reselection_id, lun); + } + retry: + if(slot == NULL) { + struct NCR_700_command_slot *s = find_ITL_Nexus(hostdata, reselection_id, lun); + printk(KERN_ERR "scsi%d: (%d:%d) RESELECTED but no saved command (MSG = %02x %02x %02x)!!\n", + host->host_no, reselection_id, lun, + hostdata->msgin[0], hostdata->msgin[1], + hostdata->msgin[2]); + printk(KERN_ERR " OUTSTANDING TAGS:"); + while(s != NULL) { + if(s->cmnd->target == reselection_id && + s->cmnd->lun == lun) { + printk("%d ", s->tag); + if(s->tag == hostdata->msgin[2]) { + printk(" ***FOUND*** \n"); + slot = s; + goto retry; + } + + } + s = s->ITL_back; + } + printk("\n"); + } else { + if(hostdata->state != NCR_700_HOST_BUSY) + printk(KERN_ERR "scsi%d: FATAL, host not busy during valid reselection!\n", + host->host_no); + resume_offset = 
slot->resume_offset; + hostdata->cmd = slot->cmnd; + + /* re-patch for this command */ + script_patch_32_abs(hostdata->script, CommandAddress, + virt_to_bus(slot->cmnd->cmnd)); + script_patch_16(hostdata->script, + CommandCount, slot->cmnd->cmd_len); + script_patch_32_abs(hostdata->script, SGScriptStartAddress, + virt_to_bus(&slot->SG[0].ins)); + + /* Note: setting SXFER only works if we're + * still in the MESSAGE phase, so it is vital + * that ACK is still asserted when we process + * the reselection message. The resume offset + * should therefore always clear ACK */ + NCR_700_writeb(NCR_700_get_SXFER(hostdata->cmd->device), + host, SXFER_REG); + + } + } else if(dsps == A_RESELECTED_DURING_SELECTION) { + + /* This section is full of debugging code because I've + * never managed to reach it. I think what happens is + * that, because the 700 runs with selection + * interrupts enabled the whole time that we take a + * selection interrupt before we manage to get to the + * reselected script interrupt */ + + __u8 reselection_id = NCR_700_readb(host, SFBR_REG); + struct NCR_700_command_slot *slot; + + /* Take out our own ID */ + reselection_id &= ~(1<this_id); + + printk(KERN_INFO "scsi%d: (%d:%d) RESELECTION DURING SELECTION, dsp=%p[%04x] state=%d, count=%d\n", + host->host_no, reselection_id, lun, (void *)dsp, dsp - hostdata->pScript, hostdata->state, hostdata->command_slot_count); + + { + /* FIXME: DEBUGGING CODE */ + __u32 SG = (__u32)bus_to_virt(hostdata->script[A_SGScriptStartAddress_used[0]]); + int i; + + for(i=0; i< NCR_700_COMMAND_SLOTS_PER_HOST; i++) { + if(SG >= (__u32)(&hostdata->slots[i].SG[0]) + && SG <= (__u32)(&hostdata->slots[i].SG[NCR_700_SG_SEGMENTS])) + break; + } + printk(KERN_INFO "IDENTIFIED SG segment as being %p in slot %p, cmd %p, slot->resume_offset=%p\n", (void *)SG, &hostdata->slots[i], hostdata->slots[i].cmnd, (void *)hostdata->slots[i].resume_offset); + SCp = hostdata->slots[i].cmnd; + } + + if(SCp != NULL) { + slot = (struct NCR_700_command_slot *)SCp->host_scribble; + /* change slot from busy to queued to redo command */ + slot->state = NCR_700_SLOT_QUEUED; + } + hostdata->cmd = NULL; + + if(reselection_id == 0) { + if(hostdata->reselection_id == 0xff) { + printk(KERN_ERR "scsi%d: Invalid reselection during selection!!\n", host->host_no); + return 0; + } else { + printk(KERN_ERR "scsi%d: script reselected and we took a selection interrupt\n", + host->host_no); + reselection_id = hostdata->reselection_id; + } + } else { + + /* convert to real ID */ + reselection_id = bitmap_to_number(reselection_id); + } + hostdata->reselection_id = reselection_id; + hostdata->msgin[1] = 0; + dma_cache_wback((unsigned long)hostdata->msgin, sizeof(hostdata->msgin)); + if(hostdata->tag_negotiated & (1<pScript + Ent_GetReselectionWithTag; + } else { + resume_offset = hostdata->pScript + Ent_GetReselectionData; + } + } else if(dsps == A_COMPLETED_SELECTION_AS_TARGET) { + /* we've just disconnected from the bus, do nothing since + * a return here will re-run the queued command slot + * that may have been interrupted by the initial selection */ + DEBUG((" SELECTION COMPLETED\n")); + } else if((dsps & 0xfffff0f0) == A_MSG_IN) { + resume_offset = process_message(host, hostdata, SCp, + dsp, dsps); + } else if((dsps & 0xfffff000) == 0) { + __u8 i = (dsps & 0xf0) >> 4, j = (dsps & 0xf00) >> 8; + printk(KERN_ERR "scsi%d: (%d:%d), unhandled script condition %s %s at %04x\n", + host->host_no, pun, lun, NCR_700_condition[i], + NCR_700_phase[j], dsp - hostdata->pScript); + if(SCp != NULL) { + 
print_command(SCp->cmnd); + + if(SCp->use_sg) { + for(i = 0; i < SCp->use_sg + 1; i++) { + printk(KERN_INFO " SG[%d].length = %d, move_insn=%08x, addr %08x\n", i, ((struct scatterlist *)SCp->buffer)[i].length, ((struct NCR_700_command_slot *)SCp->host_scribble)->SG[i].ins, ((struct NCR_700_command_slot *)SCp->host_scribble)->SG[i].pAddr); + } + } + } + NCR_700_internal_bus_reset(host); + } else if((dsps & 0xfffff000) == A_DEBUG_INTERRUPT) { + printk(KERN_NOTICE "scsi%d (%d:%d) DEBUG INTERRUPT %d AT %p[%04x], continuing\n", + host->host_no, pun, lun, dsps & 0xfff, (void *)dsp, dsp - hostdata->pScript); + resume_offset = dsp; + } else { + printk(KERN_ERR "scsi%d: (%d:%d), unidentified script interrupt 0x%x at %04x\n", + host->host_no, pun, lun, dsps, dsp - hostdata->pScript); + NCR_700_internal_bus_reset(host); + } + return resume_offset; +} + +/* We run the 53c700 with selection interrupts always enabled. This + * means that the chip may be selected as soon as the bus frees. On a + * busy bus, this can be before the scripts engine finishes its + * processing. Therefore, part of the selection processing has to be + * to find out what the scripts engine is doing and complete the + * function if necessary (i.e. process the pending disconnect or save + * the interrupted initial selection */ +STATIC inline __u32 +process_selection(struct Scsi_Host *host, __u32 dsp) +{ + __u8 id = 0; /* Squash compiler warning */ + int count = 0; + __u32 resume_offset = 0; + struct NCR_700_Host_Parameters *hostdata = + (struct NCR_700_Host_Parameters *)host->hostdata[0]; + Scsi_Cmnd *SCp = hostdata->cmd; + __u8 sbcl; + + for(count = 0; count < 5; count++) { + id = NCR_700_readb(host, SFBR_REG); + + /* Take out our own ID */ + id &= ~(1<this_id); + if(id != 0) + break; + udelay(5); + } + sbcl = NCR_700_readb(host, SBCL_REG); + if((sbcl & SBCL_IO) == 0) { + /* mark as having been selected rather than reselected */ + id = 0xff; + } else { + /* convert to real ID */ + hostdata->reselection_id = id = bitmap_to_number(id); + DEBUG(("scsi%d: Reselected by %d\n", + host->host_no, id)); + } + if(hostdata->state == NCR_700_HOST_BUSY && SCp != NULL) { + struct NCR_700_command_slot *slot = + (struct NCR_700_command_slot *)SCp->host_scribble; + DEBUG((" ID %d WARNING: RESELECTION OF BUSY HOST, saving cmd %p, slot %p, addr %x [%04x], resume %x!\n", id, hostdata->cmd, slot, dsp, dsp - hostdata->pScript, resume_offset)); + + switch(dsp - hostdata->pScript) { + case Ent_Disconnect1: + case Ent_Disconnect2: + save_for_reselection(hostdata, SCp, Ent_Disconnect2 + hostdata->pScript); + break; + case Ent_Disconnect3: + case Ent_Disconnect4: + save_for_reselection(hostdata, SCp, Ent_Disconnect4 + hostdata->pScript); + break; + case Ent_Disconnect5: + case Ent_Disconnect6: + save_for_reselection(hostdata, SCp, Ent_Disconnect6 + hostdata->pScript); + break; + case Ent_Disconnect7: + case Ent_Disconnect8: + save_for_reselection(hostdata, SCp, Ent_Disconnect8 + hostdata->pScript); + break; + case Ent_Finish1: + case Ent_Finish2: + process_script_interrupt(A_GOOD_STATUS_AFTER_STATUS, dsp, SCp, host, hostdata); + break; + + default: + slot->state = NCR_700_SLOT_QUEUED; + break; + } + } + hostdata->state = NCR_700_HOST_BUSY; + hostdata->cmd = NULL; + hostdata->msgin[1] = 0; + dma_cache_wback((unsigned long)hostdata->msgin, sizeof(hostdata->msgin)); + + if(id == 0xff) { + /* Selected as target, Ignore */ + resume_offset = hostdata->pScript + Ent_SelectedAsTarget; + } else if(hostdata->tag_negotiated & (1<pScript + Ent_GetReselectionWithTag; + 
} else { + resume_offset = hostdata->pScript + Ent_GetReselectionData; + } + return resume_offset; +} + + +STATIC int +NCR_700_start_command(Scsi_Cmnd *SCp) +{ + struct NCR_700_command_slot *slot = + (struct NCR_700_command_slot *)SCp->host_scribble; + struct NCR_700_Host_Parameters *hostdata = + (struct NCR_700_Host_Parameters *)SCp->host->hostdata[0]; + unsigned long flags; + __u16 count = 1; /* for IDENTIFY message */ + + save_flags(flags); + cli(); + if(hostdata->state != NCR_700_HOST_FREE) { + /* keep this inside the lock to close the race window where + * the running command finishes on another CPU while we don't + * change the state to queued on this one */ + slot->state = NCR_700_SLOT_QUEUED; + restore_flags(flags); + + DEBUG(("scsi%d: host busy, queueing command %p, slot %p\n", + SCp->host->host_no, slot->cmnd, slot)); + return 0; + } + hostdata->state = NCR_700_HOST_BUSY; + hostdata->cmd = SCp; + slot->state = NCR_700_SLOT_BUSY; + /* keep interrupts disabled until we have the command correctly + * set up so we cannot take a selection interrupt */ + + hostdata->msgout[0] = NCR_700_identify(SCp->cmnd[0] != REQUEST_SENSE, + SCp->lun); + /* for INQUIRY or REQUEST_SENSE commands, we cannot be sure + * if the negotiated transfer parameters still hold, so + * always renegotiate them */ + if(SCp->cmnd[0] == INQUIRY || SCp->cmnd[0] == REQUEST_SENSE) { + NCR_700_clear_flag(SCp->device, NCR_700_DEV_NEGOTIATED_SYNC); + } + + /* REQUEST_SENSE is asking for contingent I_T_L status. If a + * contingent allegiance condition exists, the device will + * refuse all tags, so send the request sense as untagged */ + if((hostdata->tag_negotiated & (1<target)) + && (slot->tag != NCR_700_NO_TAG && SCp->cmnd[0] != REQUEST_SENSE)) { + hostdata->msgout[count++] = A_SIMPLE_TAG_MSG; + hostdata->msgout[count++] = slot->tag; + } + + if(hostdata->fast && + NCR_700_is_flag_clear(SCp->device, NCR_700_DEV_NEGOTIATED_SYNC)) { + memcpy(&hostdata->msgout[count], NCR_700_SDTR_msg, + sizeof(NCR_700_SDTR_msg)); + count += sizeof(NCR_700_SDTR_msg); + NCR_700_set_flag(SCp->device, NCR_700_DEV_BEGIN_SYNC_NEGOTIATION); + } + + dma_cache_wback((unsigned long)hostdata->msgout, count); + + script_patch_16(hostdata->script, MessageCount, count); + + + script_patch_ID(hostdata->script, + Device_ID, 1<target); + + script_patch_32_abs(hostdata->script, CommandAddress, + virt_to_bus(SCp->cmnd)); + script_patch_16(hostdata->script, CommandCount, SCp->cmd_len); + /* finally plumb the beginning of the SG list into the script + * */ + script_patch_32_abs(hostdata->script, SGScriptStartAddress, + virt_to_bus(&slot->SG[0].ins)); + NCR_700_writeb(CLR_FIFO, SCp->host, DFIFO_REG); + + /* set the synchronous period/offset */ + if(slot->resume_offset == 0) + slot->resume_offset = hostdata->pScript; + NCR_700_writeb(NCR_700_get_SXFER(SCp->device), + SCp->host, SXFER_REG); + /* allow interrupts here so that if we're selected we can take + * a selection interrupt. 
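For reference, the message-out buffer assembled in NCR_700_start_command() carries the standard SCSI-2 byte sequence sketched below. The exact contents of NCR_700_SDTR_msg are not visible in this hunk, so the trailing bytes are an assumption based on the SCSI-2 SDTR format:

	/* msgout[] after setup (tagged command, sync negotiation pending):
	 *
	 *   [0] IDENTIFY           0x80 | (DiscPriv << 6) | lun
	 *   [1] SIMPLE QUEUE TAG   0x20
	 *   [2] tag                slot->tag
	 *   [3] EXTENDED MESSAGE   0x01  \
	 *   [4] length             0x03   \  presumed layout of
	 *   [5] SDTR               0x01   /  NCR_700_SDTR_msg
	 *   [6] period  [7] offset       /
	 */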
The script start may not be + * effective in this case, but the selection interrupt will + * save our command in that case */ + NCR_700_writel(slot->temp, SCp->host, TEMP_REG); + NCR_700_writel(slot->resume_offset, SCp->host, DSP_REG); + restore_flags(flags); + + return 1; +} + +void +NCR_700_intr(int irq, void *dev_id, struct pt_regs *regs) +{ + struct Scsi_Host *host = (struct Scsi_Host *)dev_id; + struct NCR_700_Host_Parameters *hostdata = + (struct NCR_700_Host_Parameters *)host->hostdata[0]; + __u8 istat; + __u32 resume_offset = 0; + __u8 pun = 0xff, lun = 0xff; + unsigned long flags; + + /* Unfortunately, we have to take the io_request_lock here + * rather than the host lock hostdata->lock because we're + * looking to exclude queuecommand from messing with the + * registers while we're processing the interrupt. Since + * queuecommand is called holding io_request_lock, and we have + * to take io_request_lock before we call the command + * scsi_done, we would get a deadlock if we took + * hostdata->lock here and in queuecommand (because the order + * of locking in queuecommand: 1) io_request_lock then 2) + * hostdata->lock would be the reverse of taking it in this + * routine */ + spin_lock_irqsave(&io_request_lock, flags); + if((istat = NCR_700_readb(host, ISTAT_REG)) + & (SCSI_INT_PENDING | DMA_INT_PENDING)) { + __u32 dsps; + __u8 sstat0 = 0, dstat = 0; + __u32 dsp; + Scsi_Cmnd *SCp = hostdata->cmd; + enum NCR_700_Host_State state; + + state = hostdata->state; + SCp = hostdata->cmd; + + if(istat & SCSI_INT_PENDING) { + udelay(10); + + sstat0 = NCR_700_readb(host, SSTAT0_REG); + } + + if(istat & DMA_INT_PENDING) { + udelay(10); + + dstat = NCR_700_readb(host, DSTAT_REG); + } + + dsps = NCR_700_readl(host, DSPS_REG); + dsp = NCR_700_readl(host, DSP_REG); + + DEBUG(("scsi%d: istat %02x sstat0 %02x dstat %02x dsp %04x[%08x] dsps 0x%x\n", + host->host_no, istat, sstat0, dstat, + (dsp - (__u32)virt_to_bus(hostdata->script))/4, + dsp, dsps)); + + if(SCp != NULL) { + pun = SCp->target; + lun = SCp->lun; + } + + if(sstat0 & SCSI_RESET_DETECTED) { + Scsi_Device *SDp; + int i; + + hostdata->state = NCR_700_HOST_BUSY; + + printk(KERN_ERR "scsi%d: Bus Reset detected, executing command %p, slot %p, dsp %p[%04x]\n", + host->host_no, SCp, SCp == NULL ? 
NULL : SCp->host_scribble, (void *)dsp, dsp - hostdata->pScript); + + /* clear all the negotiated parameters */ + for(SDp = host->host_queue; SDp != NULL; SDp = SDp->next) + SDp->hostdata = 0; + + /* clear all the slots and their pending commands */ + for(i = 0; i < NCR_700_COMMAND_SLOTS_PER_HOST; i++) { + Scsi_Cmnd *SCp; + struct NCR_700_command_slot *slot = + &hostdata->slots[i]; + + if(slot->state == NCR_700_SLOT_FREE) + continue; + + SCp = slot->cmnd; + printk(KERN_ERR " failing command because of reset, slot %p, cmnd %p\n", + slot, SCp); + free_slot(slot, hostdata); + SCp->host_scribble = NULL; + NCR_700_set_depth(SCp->device, 0); + /* NOTE: deadlock potential here: we + * rely on mid-layer guarantees that + * scsi_done won't try to issue the + * command again otherwise we'll + * deadlock on the + * hostdata->state_lock */ + SCp->result = DID_RESET << 16; + SCp->scsi_done(SCp); + } + mdelay(25); + NCR_700_chip_setup(host); + + hostdata->state = NCR_700_HOST_FREE; + hostdata->cmd = NULL; + goto out_unlock; + } else if(sstat0 & SELECTION_TIMEOUT) { + DEBUG(("scsi%d: (%d:%d) selection timeout\n", + host->host_no, pun, lun)); + NCR_700_scsi_done(hostdata, SCp, DID_NO_CONNECT<<16); + } else if(sstat0 & PHASE_MISMATCH) { + struct NCR_700_command_slot *slot = (SCp == NULL) ? NULL : + (struct NCR_700_command_slot *)SCp->host_scribble; + + if(dsp == Ent_SendMessage + 8 + hostdata->pScript) { + /* It wants to reply to some part of + * our message */ +#ifdef NCR_700_DEBUG + __u32 temp = NCR_700_readl(host, TEMP_REG); + int count = (hostdata->script[Ent_SendMessage/4] & 0xffffff) - ((NCR_700_readl(host, DBC_REG) & 0xffffff) + NCR_700_data_residual(host)); + printk("scsi%d (%d:%d) PHASE MISMATCH IN SEND MESSAGE %d remain, return %p[%04x], phase %s\n", host->host_no, pun, lun, count, (void *)temp, temp - hostdata->pScript, sbcl_to_string(NCR_700_readb(host, SBCL_REG))); +#endif + resume_offset = hostdata->pScript + Ent_SendMessagePhaseMismatch; + } else if(dsp >= virt_to_bus(&slot->SG[0].ins) && + dsp <= virt_to_bus(&slot->SG[NCR_700_SG_SEGMENTS].ins)) { + int data_transfer = NCR_700_readl(host, DBC_REG) & 0xffffff; + int SGcount = (dsp - virt_to_bus(&slot->SG[0].ins))/sizeof(struct NCR_700_SG_List); + int residual = NCR_700_data_residual(host); + int i; +#ifdef NCR_700_DEBUG + printk("scsi%d: (%d:%d) Expected phase mismatch in slot->SG[%d], transferred 0x%x\n", + host->host_no, pun, lun, + SGcount, data_transfer); + print_command(SCp->cmnd); + if(residual) { + printk("scsi%d: (%d:%d) Expected phase mismatch in slot->SG[%d], transferred 0x%x, residual %d\n", + host->host_no, pun, lun, + SGcount, data_transfer, residual); + } +#endif + data_transfer += residual; + + if(data_transfer != 0) { + int count; + __u32 pAddr; + + SGcount--; + + count = (bS_to_cpu(slot->SG[SGcount].ins) & 0x00ffffff); + DEBUG(("DATA TRANSFER MISMATCH, count = %d, transferred %d\n", count, count-data_transfer)); + slot->SG[SGcount].ins &= bS_to_host(0xff000000); + slot->SG[SGcount].ins |= bS_to_host(data_transfer); + pAddr = bS_to_cpu(slot->SG[SGcount].pAddr); + pAddr += (count - data_transfer); + slot->SG[SGcount].pAddr = bS_to_host(pAddr); + } + /* set the executed moves to nops */ + for(i=0; iSG[i].ins = bS_to_host(SCRIPT_NOP); + slot->SG[i].pAddr = 0; + } + dma_cache_wback((unsigned long)slot->SG, sizeof(slot->SG)); + /* and pretend we disconnected after + * the command phase */ + resume_offset = hostdata->pScript + Ent_MsgInDuringData; + } else { + __u8 sbcl = NCR_700_readb(host, SBCL_REG); + printk(KERN_ERR "scsi%d: 
(%d:%d) phase mismatch at %04x, phase %s\n", + host->host_no, pun, lun, dsp - hostdata->pScript, sbcl_to_string(sbcl)); + NCR_700_internal_bus_reset(host); + } + + } else if(sstat0 & SCSI_GROSS_ERROR) { + printk(KERN_ERR "scsi%d: (%d:%d) GROSS ERROR\n", + host->host_no, pun, lun); + NCR_700_scsi_done(hostdata, SCp, DID_ERROR<<16); + } else if(dstat & SCRIPT_INT_RECEIVED) { + DEBUG(("scsi%d: (%d:%d) ====>SCRIPT INTERRUPT<====\n", + host->host_no, pun, lun)); + resume_offset = process_script_interrupt(dsps, dsp, SCp, host, hostdata); + } else if(dstat & (ILGL_INST_DETECTED)) { + printk(KERN_ERR "scsi%d: (%d:%d) Illegal Instruction detected at 0x%p[0x%x]!!!\n" + " Please email James.Bottomley@HansenPartnership.com with the details\n", + host->host_no, pun, lun, + (void *)dsp, dsp - hostdata->pScript); + NCR_700_scsi_done(hostdata, SCp, DID_ERROR<<16); + } else if(dstat & (WATCH_DOG_INTERRUPT|ABORTED)) { + printk(KERN_ERR "scsi%d: (%d:%d) serious DMA problem, dstat=%02x\n", + host->host_no, pun, lun, dstat); + NCR_700_scsi_done(hostdata, SCp, DID_ERROR<<16); + } + + + /* NOTE: selection interrupt processing MUST occur + * after script interrupt processing to correctly cope + * with the case where we process a disconnect and + * then get reselected before we process the + * disconnection */ + if(sstat0 & SELECTED) { + /* FIXME: It currently takes at least FOUR + * interrupts to complete a command that + * disconnects: one for the disconnect, one + * for the reselection, one to get the + * reselection data and one to complete the + * command. If we guess the reselected + * command here and prepare it, we only need + * to get a reselection data interrupt if we + * guessed wrongly. Since the interrupt + * overhead is much greater than the command + * setup, this would be an efficient + * optimisation particularly as we probably + * only have one outstanding command on a + * target most of the time */ + + resume_offset = process_selection(host, dsp); + + } + + } + + if(resume_offset) { + if(hostdata->state != NCR_700_HOST_BUSY) { + printk(KERN_ERR "scsi%d: Driver error: resume at %p [%04x] with non busy host!\n", + host->host_no, (void *)resume_offset, resume_offset - hostdata->pScript); + hostdata->state = NCR_700_HOST_BUSY; + } + + DEBUG(("Attempting to resume at %x\n", resume_offset)); + NCR_700_writeb(CLR_FIFO, host, DFIFO_REG); + NCR_700_writel(resume_offset, host, DSP_REG); + } + /* There is probably a technical no-no about this: If we're a + * shared interrupt and we got this interrupt because the + * other device needs servicing not us, we're still going to + * check our queued commands here---of course, there shouldn't + * be any outstanding.... 
*/ + if(hostdata->state == NCR_700_HOST_FREE) { + int i; + + for(i = 0; i < NCR_700_COMMAND_SLOTS_PER_HOST; i++) { + /* fairness: always run the queue from the last + * position we left off */ + int j = (i + hostdata->saved_slot_position) + % NCR_700_COMMAND_SLOTS_PER_HOST; + + if(hostdata->slots[j].state != NCR_700_SLOT_QUEUED) + continue; + if(NCR_700_start_command(hostdata->slots[j].cmnd)) { + DEBUG(("scsi%d: Issuing saved command slot %p, cmd %p\t\n", + host->host_no, &hostdata->slots[j], + hostdata->slots[j].cmnd)); + hostdata->saved_slot_position = j + 1; + } + + break; + } + } + out_unlock: + spin_unlock_irqrestore(&io_request_lock, flags); +} + +/* FIXME: Need to put some proc information in and plumb it + * into the scsi proc system */ +STATIC int +NCR_700_proc_directory_info(char *proc_buf, char **startp, + off_t offset, int bytes_available, + int host_no, int write) +{ + static char buf[4096]; /* 1 page should be sufficient */ + int len = 0; + struct Scsi_Host *host = scsi_hostlist; + struct NCR_700_Host_Parameters *hostdata; + Scsi_Device *SDp; + + while(host != NULL && host->host_no != host_no) + host = host->next; + + if(host == NULL) + return 0; + + if(write) { + /* FIXME: Clear internal statistics here */ + return 0; + } + hostdata = (struct NCR_700_Host_Parameters *)host->hostdata[0]; + len += sprintf(&buf[len], "Total commands outstanding: %d\n", hostdata->command_slot_count); + len += sprintf(&buf[len],"\ +Target Depth Active Next Tag\n\ +====== ===== ====== ========\n"); + for(SDp = host->host_queue; SDp != NULL; SDp = SDp->next) { + len += sprintf(&buf[len]," %2d:%2d %4d %4d %4d\n", SDp->id, SDp->lun, SDp->queue_depth, NCR_700_get_depth(SDp), SDp->current_tag); + } + if((len -= offset) <= 0) + return 0; + if(len > bytes_available) + len = bytes_available; + memcpy(proc_buf, buf + offset, len); + return len; +} + +STATIC int +NCR_700_queuecommand(Scsi_Cmnd *SCp, void (*done)(Scsi_Cmnd *)) +{ + struct NCR_700_Host_Parameters *hostdata = + (struct NCR_700_Host_Parameters *)SCp->host->hostdata[0]; + __u32 move_ins; + struct NCR_700_command_slot *slot; + int hash; + + if(hostdata->command_slot_count >= NCR_700_COMMAND_SLOTS_PER_HOST) { + /* We're over our allocation, this should never happen + * since we report the max allocation to the mid layer */ + printk(KERN_WARNING "scsi%d: Command depth has gone over queue depth\n", SCp->host->host_no); + return 1; + } + if(NCR_700_get_depth(SCp->device) != 0 && !(hostdata->tag_negotiated & (1<target))) { + DEBUG((KERN_ERR "scsi%d (%d:%d) has non zero depth %d\n", + SCp->host->host_no, SCp->target, SCp->lun, + NCR_700_get_depth(SCp->device))); + return 1; + } + if(NCR_700_get_depth(SCp->device) >= NCR_700_MAX_TAGS) { + DEBUG((KERN_ERR "scsi%d (%d:%d) has max tag depth %d\n", + SCp->host->host_no, SCp->target, SCp->lun, + NCR_700_get_depth(SCp->device))); + return 1; + } + NCR_700_set_depth(SCp->device, NCR_700_get_depth(SCp->device) + 1); + + /* begin the command here */ + /* no need to check for NULL, test for command_slot_cound above + * ensures a slot is free */ + slot = find_empty_slot(hostdata); + + slot->cmnd = SCp; + + SCp->scsi_done = done; + SCp->host_scribble = (unsigned char *)slot; + SCp->SCp.ptr = NULL; + SCp->SCp.buffer = NULL; + +#ifdef NCR_700_DEBUG + printk("53c700: scsi%d, command ", SCp->host->host_no); + print_command(SCp->cmnd); +#endif + + if(hostdata->tag_negotiated &(1<target)) { + + struct NCR_700_command_slot *old = + find_ITL_Nexus(hostdata, SCp->target, SCp->lun); +#ifdef NCR_700_TAG_DEBUG + struct 
NCR_700_command_slot *found; +#endif + + if(old != NULL && old->tag == SCp->device->current_tag) { + printk(KERN_WARNING "scsi%d (%d:%d) Tag clock back to current, queueing\n", SCp->host->host_no, SCp->target, SCp->lun); + return 1; + } + slot->tag = SCp->device->current_tag++; +#ifdef NCR_700_TAG_DEBUG + while((found = find_ITLQ_Nexus(hostdata, SCp->target, SCp->lun, slot->tag)) != NULL) { + printk("\n\n**ERROR** already using tag %d, but oldest is %d\n", slot->tag, (old == NULL) ? -1 : old->tag); + printk(" FOUND = %p, tag = %d, pun = %d, lun = %d\n", + found, found->tag, found->cmnd->target, found->cmnd->lun); + slot->tag = SCp->device->current_tag++; + printk(" Tag list is: "); + while(old != NULL) { + if(old->cmnd->target == SCp->target && + old->cmnd->lun == SCp->lun) + printk("%d ", old->tag); + old = old->ITL_back; + } + printk("\n\n"); + } +#endif + hash = hash_ITLQ(SCp->target, SCp->lun, slot->tag); + /* link into the ITLQ hash queues */ + slot->ITLQ_forw = hostdata->ITLQ_Hash_forw[hash]; + hostdata->ITLQ_Hash_forw[hash] = slot; +#ifdef NCR_700_TAG_DEBUG + if(slot->ITLQ_forw != NULL && slot->ITLQ_forw->ITLQ_back != NULL) { + printk(KERN_ERR "scsi%d (%d:%d) ITLQ_back is not NULL!!!!\n", SCp->host->host_no, SCp->target, SCp->lun); + } +#endif + if(slot->ITLQ_forw != NULL) + slot->ITLQ_forw->ITLQ_back = slot; + else + hostdata->ITLQ_Hash_back[hash] = slot; + slot->ITLQ_back = NULL; + } else { + slot->tag = NCR_700_NO_TAG; + } + /* link into the ITL hash queues */ + hash = hash_ITL(SCp->target, SCp->lun); + slot->ITL_forw = hostdata->ITL_Hash_forw[hash]; + hostdata->ITL_Hash_forw[hash] = slot; +#ifdef NCR_700_TAG_DEBUG + if(slot->ITL_forw != NULL && slot->ITL_forw->ITL_back != NULL) { + printk(KERN_ERR "scsi%d (%d:%d) ITL_back is not NULL!!!!\n", + SCp->host->host_no, SCp->target, SCp->lun); + } +#endif + if(slot->ITL_forw != NULL) + slot->ITL_forw->ITL_back = slot; + else + hostdata->ITL_Hash_back[hash] = slot; + slot->ITL_back = NULL; + + + /* This is f****g ridiculous; every low level HBA driver has + * to determine the direction of the commands, why isn't this + * done inside the scsi_lib !!??? */ + switch (SCp->cmnd[0]) { + case REQUEST_SENSE: + /* clear the internal sense magic */ + SCp->cmnd[6] = 0; + /* fall through */ + case INQUIRY: + case MODE_SENSE: + case READ_6: + case READ_10: + case READ_12: + case READ_CAPACITY: + case READ_BLOCK_LIMITS: + case READ_TOC: + move_ins = SCRIPT_MOVE_DATA_IN; + break; + case MODE_SELECT: + case WRITE_6: + case WRITE_10: + case WRITE_12: + move_ins = SCRIPT_MOVE_DATA_OUT; + break; + case TEST_UNIT_READY: + case ALLOW_MEDIUM_REMOVAL: + case START_STOP: + move_ins = 0; + break; + default: + /* OK, get it from the command */ + switch(SCp->sc_data_direction) { + case SCSI_DATA_UNKNOWN: + default: + printk(KERN_ERR "53c700: Unknown command for data direction "); + print_command(SCp->cmnd); + + move_ins = 0; + break; + case SCSI_DATA_NONE: + move_ins = 0; + break; + case SCSI_DATA_READ: + move_ins = SCRIPT_MOVE_DATA_IN; + break; + case SCSI_DATA_WRITE: + move_ins = SCRIPT_MOVE_DATA_OUT; + break; + } + } + + /* now build the scatter gather list */ + if(move_ins != 0) { + int i; + + for(i = 0; i < (SCp->use_sg ? 
SCp->use_sg : 1); i++) { + void *vPtr; + __u32 count; + + if(SCp->use_sg) { + vPtr = (((struct scatterlist *)SCp->buffer)[i].address); + count = ((struct scatterlist *)SCp->buffer)[i].length; + } else { + vPtr = SCp->request_buffer; + count = SCp->request_bufflen; + } + slot->SG[i].ins = bS_to_host(move_ins | count); + DEBUG((" scatter block %d: move %d[%08x] from 0x%lx\n", + i, count, slot->SG[i].ins, + virt_to_bus(vPtr))); + dma_cache_wback_inv((unsigned long)vPtr, count); + slot->SG[i].pAddr = bS_to_host(virt_to_bus(vPtr)); + } + slot->SG[i].ins = bS_to_host(SCRIPT_RETURN); + slot->SG[i].pAddr = 0; + dma_cache_wback((unsigned long)slot->SG, sizeof(slot->SG)); + DEBUG((" SETTING %08lx to %x\n", + virt_to_bus(&slot->SG[i].ins), + slot->SG[i].ins)); + } + slot->resume_offset = 0; + NCR_700_start_command(SCp); + return 0; +} + +STATIC int +NCR_700_abort(Scsi_Cmnd * SCp) +{ + struct NCR_700_command_slot *slot; + struct NCR_700_Host_Parameters *hostdata = + (struct NCR_700_Host_Parameters *)SCp->host->hostdata[0]; + + printk(KERN_INFO "scsi%d (%d:%d) New error handler wants to abort command\n\t", + SCp->host->host_no, SCp->target, SCp->lun); + print_command(SCp->cmnd); + + slot = find_ITL_Nexus(hostdata, SCp->target, SCp->lun); + while(slot != NULL && slot->cmnd != SCp) + slot = slot->ITL_back; + + if(slot == NULL) + /* no outstanding command to abort */ + return SUCCESS; + if(SCp->cmnd[0] == TEST_UNIT_READY) { + /* FIXME: This is because of a problem in the new + * error handler. When it is in error recovery, it + * will send a TUR to a device it thinks may still be + * showing a problem. If the TUR isn't responded to, + * it will abort it and mark the device off line. + * Unfortunately, it does no other error recovery, so + * this would leave us with an outstanding command + * occupying a slot. Rather than allow this to + * happen, we issue a bus reset to force all + * outstanding commands to terminate here. 
*/ + NCR_700_internal_bus_reset(SCp->host); + /* still drop through and return failed */ + } + return FAILED; + +} + +STATIC int +NCR_700_bus_reset(Scsi_Cmnd * SCp) +{ + printk(KERN_INFO "scsi%d (%d:%d) New error handler wants BUS reset, cmd %p\n\t", + SCp->host->host_no, SCp->target, SCp->lun, SCp); + print_command(SCp->cmnd); + NCR_700_internal_bus_reset(SCp->host); + return SUCCESS; +} + +STATIC int +NCR_700_dev_reset(Scsi_Cmnd * SCp) +{ + printk(KERN_INFO "scsi%d (%d:%d) New error handler wants device reset\n\t", + SCp->host->host_no, SCp->target, SCp->lun); + print_command(SCp->cmnd); + + return FAILED; +} + +STATIC int +NCR_700_host_reset(Scsi_Cmnd * SCp) +{ + printk(KERN_INFO "scsi%d (%d:%d) New error handler wants HOST reset\n\t", + SCp->host->host_no, SCp->target, SCp->lun); + print_command(SCp->cmnd); + + NCR_700_internal_bus_reset(SCp->host); + NCR_700_chip_reset(SCp->host); + return SUCCESS; +} + +EXPORT_SYMBOL(NCR_700_detect); +EXPORT_SYMBOL(NCR_700_release); +EXPORT_SYMBOL(NCR_700_intr); diff -urN /md0/kernels/2.4/v2.4.9-ac14/fs/Makefile aio-v2.4.9-ac14.diff/fs/Makefile --- /md0/kernels/2.4/v2.4.9-ac14/fs/Makefile Mon Sep 24 02:14:15 2001 +++ aio-v2.4.9-ac14.diff/fs/Makefile Mon Sep 24 19:09:13 2001 @@ -12,7 +12,7 @@ obj-y := open.o read_write.o devices.o file_table.o buffer.o \ super.o block_dev.o char_dev.o stat.o exec.o pipe.o namei.o \ - fcntl.o ioctl.o readdir.o select.o fifo.o locks.o \ + fcntl.o ioctl.o readdir.o select.o fifo.o locks.o aio.o \ dcache.o inode.o attr.o bad_inode.o file.o iobuf.o dnotify.o \ filesystems.o jbd-kernel.o namespace.o diff -urN /md0/kernels/2.4/v2.4.9-ac14/fs/aio.c aio-v2.4.9-ac14.diff/fs/aio.c --- /md0/kernels/2.4/v2.4.9-ac14/fs/aio.c Wed Dec 31 19:00:00 1969 +++ aio-v2.4.9-ac14.diff/fs/aio.c Tue Sep 25 20:59:38 2001 @@ -0,0 +1,717 @@ +//#define DEBUG 1 +/* drivers/char/aio.c + * An async IO implementation for Linux + * Written by Benjamin LaHaise + * + * Implements /dev/aio, something on top of which it should be possible + * to write a POSIX AIO library. + * + * Copyright 2000, 2001 Red Hat, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ +#define DEBUG 1 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#undef KERN_DEBUG +#define KERN_DEBUG "" +#define MAX_IOCTXS 0x800 +#define dprintk(x...) do { ; } while (0) + +static spinlock_t aio_read_lock = SPIN_LOCK_UNLOCKED; +static spinlock_t aio_req_lock = SPIN_LOCK_UNLOCKED; + +static kmem_cache_t *kiocb_cachep; +static kmem_cache_t *kioctx_cachep; + +/* Lockless for reads. Needs replacement rsn. */ +static struct kioctx *ioctx_list; +static unsigned long new_ioctx_id; + +/* tunable. Needs to be added to sysctl. 
*/ +int max_aio_reqs = 0x10000; + +/* aio_setup + * Creates the slab caches used by the aio routines, panic on + * failure as this is done early during the boot sequence. + */ +static int __init aio_setup(void) +{ + kiocb_cachep = kmem_cache_create("kiocb", sizeof(struct kiocb), + 0, SLAB_HWCACHE_ALIGN, NULL, NULL); + if (!kiocb_cachep) + panic("unable to create kiocb cache\n"); + + kioctx_cachep = kmem_cache_create("kioctx", sizeof(struct kioctx), + 0, SLAB_HWCACHE_ALIGN, NULL, NULL); + if (!kioctx_cachep) + panic("unable to create kioctx cache"); + + printk(KERN_NOTICE "aio_setup: okay!\n"); + printk(KERN_NOTICE "aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page)); + + return 0; +} + +/* ioctx_alloc + * Allocates and initializes an aioctx. Returns an ERR_PTR if it failed. + */ +static struct kioctx *ioctx_alloc(unsigned nr_reqs) +{ + struct kioctx *ctx; + unsigned i; + long size; + + if (nr_reqs > (0x70000000U / sizeof(struct io_event))) { + pr_debug("ENOMEM: nr_reqs too high\n"); + return ERR_PTR(-ENOMEM); + } + + /* Round off to a power of 2. Needed for cheap mask operations */ + for (i=1; imax_reqs = nr_reqs; + + atomic_set(&ctx->users, 1); + spin_lock_init(&ctx->lock); + init_waitqueue_head(&ctx->wait); + + size = sizeof(struct kiocb) * nr_reqs; + ctx->reqs = kmalloc(size, GFP_KERNEL); + if (!ctx->reqs) + goto out_freectx; + + memset(ctx->reqs, 0, size); + for (i=0; ireqs[i].ctx = ctx; + ctx->reqs[i].user_obj = ctx->reqs + i + 1; + } + ctx->reqs[nr_reqs-1].user_obj = NULL; + ctx->free_req = ctx->reqs; + size = sizeof(struct aio_ring); + size += sizeof(struct io_event) * nr_reqs; + /* This limits things somewhat for now. */ + ctx->ring = kmalloc(size, GFP_KERNEL); + if (!ctx->ring) + goto out_freereqs; + + memset(ctx->ring, 0, size); + ctx->mm = current->mm; + ctx->ring_mask = nr_reqs - 1; /* trusted copy */ + ctx->ring->mask = ctx->ring_mask; /* user copy */ + + /* now link into global list. kludge. FIXME */ + spin_lock(&aio_req_lock); /* FIXME */ + ctx->ring->id = ctx->user_id = new_ioctx_id++; /* FIXME */ + ctx->next = ioctx_list; /* FIXME */ + ioctx_list = ctx; /* FIXME */ + spin_unlock(&aio_req_lock); /* FIXME */ + + printk("aio: allocated aioctx %p[%ld]: mm=%p mask=0x%x\n", ctx, ctx->user_id, ctx->mm, ctx->ring->mask); + return ctx; + +out_freereqs: + kfree(ctx->reqs); +out_freectx: + kmem_cache_free(kioctx_cachep, ctx); + ctx = ERR_PTR(-ENOMEM); + + printk("aio: error allocating aioctx %p\n", ctx); + return ctx; +} + +/* __aioctx_put + * Called when the last user of an aio context has gone away, + * and the struct needs to be freed. + */ +void __aioctx_put(struct kioctx *ctx) +{ + printk("aio: free aioctx %p\n", ctx); + + kfree(ctx->ring); + kfree(ctx->reqs); + kmem_cache_free(kioctx_cachep, ctx); +} + +/* aio_get_req + * Allocate a slot for an aio request. Increments the users count + * of the kioctx so that the kioctx stays around until all requests are + * complete. Returns -EAGAIN if no requests are free. + */ +static inline struct kiocb *aio_get_req(struct kioctx *ctx) +{ + struct kiocb *req; + + /* FIXME: use cmpxchg instead of spin_lock? 
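The "round off to a power of 2" loop in ioctx_alloc() is what lets the event-ring indices wrap with a simple mask instead of a divide; the same idea as a self-contained sketch:

static unsigned example_round_up_pow2(unsigned nr)
{
	unsigned size;

	for (size = 1; size < nr; size <<= 1)
		;
	return size;			/* e.g. 1000 -> 1024 */
}

/* with size a power of two, ring_mask = size - 1 and
 *	tail = (tail + 1) & ring_mask;
 * behaves exactly like (tail + 1) % size, without the division */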
*/ + spin_lock_irq(&ctx->lock); + req = ctx->free_req; + if (req) { + ctx->free_req = req->user_obj; + spin_unlock_irq(&ctx->lock); + req->user_obj = NULL; + + atomic_inc(&ctx->users); + return req; + } + spin_unlock_irq(&ctx->lock); + + return NULL; +} + +static void aio_put_req(struct kioctx *ctx, struct kiocb *req) +{ + //fput(req->filp); /* FIXME */ + if (req->filp && atomic_dec_and_test(&req->filp->f_count)) + BUG(); /* not really, but... */ + + req->filp = NULL; + req = ctx->reqs; + /* FIXME: use cmpxchg instead of spin_lock? */ + spin_lock_irq(&ctx->lock); + req->cancel = NULL; + req->user_obj = ctx->free_req; + ctx->free_req = req; + spin_unlock_irq(&ctx->lock); +} + +/* Lookup an ioctx id. ioctx_list is lockless for reads. + * FIXME: this is O(n) and is only suitable for development. + */ +static inline struct kioctx *get_ioctx(unsigned long ctx_id) +{ + struct kioctx *ioctx = ioctx_list; + struct mm_struct *mm = current->mm; + + do { + if (ioctx->user_id == ctx_id && ioctx->mm == mm) + return ioctx; + ioctx = ioctx->next; + } while (ioctx); + + return NULL; +} + +static inline void put_ioctx(struct kioctx *ctx) +{ + // FIXME!!! + //aioctx_put(ctx); +} + +/* aio_complete + * Called when the io request on the given iocb is complete. + */ +void aio_complete(struct kiocb *iocb, long res, long res2) +{ + struct kioctx *ctx = iocb->ctx; + struct aio_ring *ring = ctx->ring; + struct io_event *event; + unsigned long flags; + unsigned long tail; + + /* add a completion event to the ring buffer. + * must be done holding ctx->lock to prevent + * other code from messing with the tail + * pointer since we might be called from irq + * context. + */ + spin_lock_irqsave(&ctx->lock, flags); + + tail = ring->tail; + event = &ring->io_events[tail]; + tail = (tail + 1) & ring->mask; + + event->obj = (u64)(unsigned long)iocb->user_obj; + event->data = iocb->user_data; + event->res = res; + event->res2 = res2; + + dprintk("aio_complete: %p[%lu]: %p: %Lx %Lx %lx %lx\n", + ctx, tail, iocb, iocb->user_obj, iocb->user_data, res, res2); + + /* after flagging the request as done, we + * must never even look at it again + */ + barrier(); + + ring->tail = tail; + + wmb(); + if (!ring->woke) + ring->woke = 1; + + spin_unlock_irqrestore(&ctx->lock, flags); + + pr_debug("added to ring %p at [%lu]\n", iocb, tail); +#if 0 + if (!wake) { + printk("kio_complete: should send user of %p a signal...\n", ctx); + } +#endif + + wake_up(&ctx->wait); + + /* everything turned out well, dispose of the aiocb. */ + aio_put_req(ctx, iocb); +} + +/* aio_read_evt + * Pull an event off of the aioctx's event ring. + * FIXME: make this use cmpxchg. + * TODO: make the ringbuffer user mmap()able (requires FIXME). 
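ioctx_alloc() threads the preallocated kiocbs into a free list through their user_obj pointers, and aio_get_req()/aio_put_req() pop and push that list under ctx->lock. Stripped of the locking and refcounting, the pattern is just a singly linked stack (sketch, hypothetical ex_* names):

struct ex_req { void *user_obj; };

static struct ex_req *ex_get(struct ex_req **freelist)		/* cf. aio_get_req */
{
	struct ex_req *req = *freelist;

	if (req) {
		*freelist = req->user_obj;	/* unlink the head of the free list */
		req->user_obj = NULL;		/* field is now free to hold the user iocb */
	}
	return req;
}

static void ex_put(struct ex_req **freelist, struct ex_req *req)	/* cf. aio_put_req */
{
	req->user_obj = *freelist;		/* push back onto the free list */
	*freelist = req;
}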
+ */ +static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent) +{ + struct aio_ring *ring = ioctx->ring; + unsigned long head; + int ret = -EAGAIN; + + pr_debug("in aio_read_evt h%lu t%lu\n", + (unsigned long)ring->head, (unsigned long)ring->tail); + barrier(); + if (ring->head == ring->tail) + goto out; + + spin_lock(&aio_read_lock); /* investigate the value of making this per-ctx */ + + head = ring->head; + if (head != ring->tail) { + *ent = ring->io_events[head]; + head = (head + 1) & ioctx->ring_mask; + barrier(); + ring->head = head; + ret = 0; + } + spin_unlock(&aio_read_lock); + +out: + pr_debug("leaving aio_read_evt: %d h%lu t%lu\n", ret, + (unsigned long)ring->head, (unsigned long)ring->tail); + return ret; +} + +struct timeout { + struct timer_list timer; + int timed_out; + wait_queue_head_t wait; +}; + +static void timeout_func(unsigned long data) +{ + struct timeout *to = (struct timeout *)data; + + to->timed_out = 1; + wake_up(&to->wait); +} + +static inline void init_timeout(struct timeout *to) +{ + init_timer(&to->timer); + to->timer.data = (unsigned long)to; + to->timer.function = timeout_func; + to->timed_out = 0; + init_waitqueue_head(&to->wait); +} + +static inline void set_timeout(struct timeout *to, struct timespec *ts) +{ + unsigned long how_long; + + if (!ts->tv_sec && !ts->tv_nsec) { + to->timed_out = 1; + return; + } + + how_long = ts->tv_sec * HZ; +#define HZ_NS (1000000000 / HZ) + how_long += (ts->tv_nsec + HZ_NS - 1) / HZ_NS; + + to->timer.expires = jiffies + how_long; + add_timer(&to->timer); +} + +static inline void clear_timeout(struct timeout *to) +{ + del_timer_sync(&to->timer); +} + +static int read_events(struct kioctx *ctx, int nr, struct io_event *event, + struct timespec *timeout) +{ + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); + DECLARE_WAITQUEUE(to_wait, tsk); + int ret; + int i = 0; + struct io_event ent; + struct timespec ts; + struct timeout to; + + init_timeout(&to); + + if (timeout) { + ret = -EFAULT; + if (copy_from_user(&ts, timeout, sizeof(ts))) + goto out; + + set_timeout(&to, &ts); + } + + memset(&ent, 0, sizeof(ent)); + ret = 0; + + while (i < nr) { + ret = aio_read_evt(ctx, &ent); + if (ret) { + if (i) + break; + + ret = 0; + if (!i && !timeout) + break; + + add_wait_queue(&ctx->wait, &wait); + add_wait_queue(&to.wait, &to_wait); + do { + set_task_state(tsk, TASK_INTERRUPTIBLE); + + ret = aio_read_evt(ctx, &ent); + if (!ret) + break; + ret = -ETIMEDOUT; + if (to.timed_out) + break; + schedule(); + if (to.timed_out) + break; + if (signal_pending(tsk)) { + ret = -EINTR; + break; + } + ret = aio_read_evt(ctx, &ent); + } while (ret) ; + + set_task_state(tsk, TASK_RUNNING); + remove_wait_queue(&ctx->wait, &wait); + remove_wait_queue(&to.wait, &to_wait); + } + + if (ret) + break; + + pr_debug("read event: %Lx %Lx %Lx %Lx\n", + ent.data, ent.obj, ent.res, ent.res2); + + /* FIXME: split checks in two */ + ret = -EFAULT; + if (copy_to_user(event, &ent, sizeof(ent))) { + /* FIXME: we lose an event here. */ + printk(KERN_DEBUG "aio: lost an event due to EFAULT.\n"); + break; + } + + /* Now complete the aio request and copy the result codes to userland. */ + event ++; + i ++; + } + + if (timeout) + clear_timeout(&to); +out: + return i ? 
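set_timeout() converts the caller's relative timespec into jiffies, rounding the nanosecond part up so a non-zero timeout can never collapse to zero ticks. A worked example, assuming HZ = 100 (so HZ_NS = 10,000,000 ns per tick):

	/* ts = { .tv_sec = 1, .tv_nsec = 500000000 }   (1.5 seconds)
	 *
	 *	how_long  = 1 * HZ                           = 100 jiffies
	 *	how_long += (500000000 + HZ_NS - 1) / HZ_NS  =  50 jiffies
	 *	timer expires at jiffies + 150
	 *
	 * ts = { 0, 1 } still yields 1 jiffy rather than 0; only an all-zero
	 * timespec takes the "time out immediately" shortcut at the top. */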
i : ret; +} + +asmlinkage long sys___io_setup(unsigned nr_reqs, aio_context_t *ctxp) +{ + struct kioctx *ioctx = NULL; + unsigned long ctx; + long ret; + + ret = get_user(ctx, ctxp); + if (ret) + goto out; + + ret = -EINVAL; + if (ctx || nr_reqs > max_aio_reqs) { + pr_debug("EINVAL: io_setup: !ctx or nr_reqs > max\n"); + goto out; + } + + ioctx = ioctx_alloc(nr_reqs); + ret = PTR_ERR(ioctx); + if (!IS_ERR(ioctx)) { + ret = put_user(ioctx->user_id, ctxp); + if (!ret) + return 0; + aioctx_put(ioctx); + } + +out: + return ret; +} + +/* aio_release + * Free the aioctx associated with the file. FIXME! + */ +asmlinkage long sys___io_destroy(aio_context_t ctx) +{ + struct kioctx *ioctx = get_ioctx(ctx); + if (ioctx) { + dprintk("aio_release(%p)\n", filp->private_data); + aioctx_put(ioctx); + return 0; + } + pr_debug("EINVAL: io_destroy: invalid context id\n"); + return -EINVAL; +} + +/* sys___io_submit + * Copy an aiocb from userspace into kernel space, then convert it to + * a kiocb, submit and repeat until done. Error codes on copy/submit + * only get returned for the first aiocb copied as otherwise the size + * of aiocbs copied is returned (standard write sematics). + */ +asmlinkage long sys___io_submit(aio_context_t ctx_id, int nr, struct iocb **iocbpp) +{ + struct kioctx *ctx; + long ret = 0; + int i; + + ctx = get_ioctx(ctx_id); + if (!ctx) { + pr_debug("EINVAL: io_submit: invalid context id\n"); + return -EINVAL; + } + + for (i=0; ireqs; + ret = put_user(tmp.aio_key, &iocbp->aio_key); + if (ret) + goto out_put_req; + + req->user_obj = iocbp; + req->user_data = tmp.aio_data; + + switch (tmp.aio_lio_opcode) { + case IOCB_CMD_PREAD: op = file->f_op->aio_read; break; + case IOCB_CMD_PREADX: op = file->f_op->aio_readx; break; + case IOCB_CMD_PWRITE: op = file->f_op->aio_write; break; + case IOCB_CMD_FSYNC: op = file->f_op->aio_fsync; break; + default: op = NULL; break; + } + ret = -EINVAL; + if (!op) { + pr_debug("EINVAL: io_submit: no operation provided\n"); + goto out_put_req; + } + + ret = op(file, req, tmp); + if (!ret) + continue; + + pr_debug("io_submit: op returned %ld\n", ret); + + out_put_req: + aio_put_req(ctx, req); + out_fput: + fput(file); + break; + } + + put_ioctx(ctx); + run_task_queue(&tq_disk); + return i ? i : ret; +} + +void generic_aio_complete(void *_iocb, struct kvec *vec, ssize_t res) +{ + struct kiocb *iocb = _iocb; + + aio_complete(iocb, res, 0); +} + +ssize_t generic_aio_read(struct file *file, struct kiocb *req, struct iocb iocb, size_t min_size) +{ + unsigned long buf = iocb.aio_buf; + size_t size = iocb.aio_nbytes; + ssize_t nr_read = 0; + loff_t pos = iocb.aio_offset; + kvec_cb_t cb; + + if (file->f_op->new_read) { + nr_read = file->f_op->new_read(file, (void *)buf, size, + &pos, F_ATOMIC); + if (-EAGAIN == nr_read) + nr_read = 0; + if ((nr_read >= min_size) || (nr_read < 0)) + return nr_read; + } + + req->nr_read = nr_read; + size -= nr_read; + buf += nr_read; + cb.vec = map_user_kvec(READ, buf, size); + cb.fn = generic_aio_complete; + cb.data = req; + + printk("generic_aio_read: cb.vec=%p\n", cb.vec); + if (IS_ERR(cb.vec)) + return nr_read ? 
nr_read : PTR_ERR(cb.vec); + + return file->f_op->kvec_read(file, cb, size, pos); +} + +ssize_t generic_file_aio_read(struct file *file, struct kiocb *req, struct iocb iocb) +{ + return generic_aio_read(file, req, iocb, iocb.aio_nbytes); +} + +ssize_t generic_aio_write(struct file *file, struct kiocb *req, struct iocb iocb, size_t min_size) +{ + unsigned long buf = iocb.aio_buf; + size_t size = iocb.aio_nbytes; + ssize_t nr_written = 0; + kvec_cb_t cb; + + if (file->f_op->new_write) { + nr_written = file->f_op->new_write(file, (void *)buf, size, + &iocb.aio_offset, F_ATOMIC); + if (-EAGAIN == nr_written) + nr_written = 0; + if ((nr_written >= min_size) || (nr_written < 0)) + return nr_written; + } + + size -= nr_written; + buf += nr_written; + cb.vec = map_user_kvec(WRITE, buf, size); + cb.fn = generic_aio_complete; + cb.data = req; + + if (IS_ERR(cb.vec)) + return nr_written ? nr_written : PTR_ERR(cb.vec); + + return file->f_op->kvec_write(file, cb, size, iocb.aio_offset); +} + +ssize_t generic_file_aio_write(struct file *file, struct kiocb *req, struct iocb iocb) +{ + return generic_aio_write(file, req, iocb, iocb.aio_nbytes); +} + +asmlinkage long sys___io_cancel(aio_context_t ctx, struct iocb *iocb) +{ + return -ENOSYS; +} + +asmlinkage long sys___io_wait(aio_context_t ctx_id, struct iocb *iocb, struct timespec *timeout) +{ +#if 0 /* FIXME. later. */ + struct kioctx *ioctx; + long ret = -EINVAL; + unsigned key; + long obj = (long)iocb; + + ioctx = get_ioctx(ctx_id); + if (!ioctx) + goto out; + + ret = get_user(key, &iocb->aio_key); + if (ret) + goto out; + + ret = __aio_complete(ioctx, key, obj, !!timeout); + put_ioctx(ioctx); + +out: + return ret; +#endif + return -ENOSYS; +} + +asmlinkage long sys___io_getevents(int ctx_id, int nr, struct io_event *events, + struct timespec *timeout) +{ + struct kioctx *ioctx = get_ioctx(ctx_id); + long ret = -EINVAL; + + if (ioctx) { + ret = read_events(ioctx, nr, events, timeout); + put_ioctx(ioctx); + } + + return ret; +} + +__initcall(aio_setup); diff -urN /md0/kernels/2.4/v2.4.9-ac14/fs/buffer.c aio-v2.4.9-ac14.diff/fs/buffer.c --- /md0/kernels/2.4/v2.4.9-ac14/fs/buffer.c Mon Sep 24 02:14:15 2001 +++ aio-v2.4.9-ac14.diff/fs/buffer.c Mon Sep 24 21:13:27 2001 @@ -141,8 +141,7 @@ { clear_bit(BH_Lock, &bh->b_state); smp_mb__after_clear_bit(); - if (waitqueue_active(&bh->b_wait)) - wake_up(&bh->b_wait); + wake_up(&bh->b_wait); } /* @@ -2066,6 +2065,7 @@ return tmp.b_blocknr; } +#if 1 /* * IO completion routine for a buffer_head being used for kiobuf IO: we * can't dispatch the kiobuf callback until io_count reaches 0. @@ -2242,6 +2242,7 @@ return transferred; return err; } +#endif /* * Start I/O on a page. @@ -2873,3 +2874,223 @@ module_init(bdflush_init) +/* async kio interface */ +struct brw_cb { + kvec_cb_t cb; + atomic_t io_count; + int nr; + struct buffer_head *bh[1]; +}; + +static inline void brw_cb_put(struct brw_cb *brw_cb) +{ + if (atomic_dec_and_test(&brw_cb->io_count)) { + ssize_t res = 0, err = 0; + int nr; + + /* Walk the buffer heads associated with this kiobuf + * checking for errors and freeing them as we go. 
+ */ + for (nr=0; nr < brw_cb->nr; nr++) { + struct buffer_head *bh = brw_cb->bh[nr]; + if (!err && buffer_uptodate(bh)) + res += bh->b_size; + else + err = -EIO; + kmem_cache_free(bh_cachep, bh); + } + + if (!res) + res = err; + + brw_cb->cb.fn(brw_cb->cb.data, brw_cb->cb.vec, res); + + kfree(brw_cb); + } +} + +/* + * IO completion routine for a buffer_head being used for kiobuf IO: we + * can't dispatch the kiobuf callback until io_count reaches 0. + */ + +static void end_buffer_io_kiobuf_async(struct buffer_head *bh, int uptodate) +{ + struct brw_cb *brw_cb; + + mark_buffer_uptodate(bh, uptodate); + + brw_cb = bh->b_private; + unlock_buffer(bh); + + brw_cb_put(brw_cb); +} + + +/* + * Start I/O on a physical range of kernel memory, defined by a vector + * of kiobuf structs (much like a user-space iovec list). + * + * The kiobuf must already be locked for IO. IO is submitted + * asynchronously: you need to check page->locked, page->uptodate, and + * maybe wait on page->wait. + * + * It is up to the caller to make sure that there are enough blocks + * passed in to completely map the iobufs to disk. + */ + +int brw_kvec_async(int rw, kvec_cb_t cb, kdev_t dev, unsigned blocks, unsigned long blknr, int sector_shift) +{ + struct kvec *vec = cb.vec; + struct kveclet *veclet; + int err; + int length; + unsigned sector_size = 1 << sector_shift; + int i; + + struct brw_cb *brw_cb; + + printk("vec: %p\n", vec); + if (!vec->nr) + BUG(); + + /* + * First, do some alignment and validity checks + */ + length = 0; + for (veclet=vec->veclet, i=0; i < vec->nr; i++,veclet++) { + length += veclet->length; + if ((veclet->offset & (sector_size-1)) || + (veclet->length & (sector_size-1))) { + printk("brw_kiovec_async: tuple[%d]->offset=0x%x length=0x%x sector_size: 0x%x\n", i, veclet->offset, veclet->length, sector_size); + return -EINVAL; + } + } + + if (length < (blocks << sector_shift)) + BUG(); + + /* + * OK to walk down the iovec doing page IO on each page we find. + */ + err = 0; + + if (!blocks) { + printk("brw_kiovec_async: !i\n"); + return -EINVAL; + } + + /* FIXME: tie into userbeans here */ + brw_cb = kmalloc(sizeof(*brw_cb) + (blocks * sizeof(struct buffer_head *)), GFP_KERNEL); + if (!brw_cb) + return -ENOMEM; + + brw_cb->cb = cb; + brw_cb->nr = 0; + + /* This is ugly. FIXME. */ + for (i=0, veclet=vec->veclet; inr; i++,veclet++) { + struct page *page = veclet->page; + unsigned offset = veclet->offset; + unsigned length = veclet->length; + + if (!page) + BUG(); + + while (length > 0) { + struct buffer_head *tmp; + tmp = kmem_cache_alloc(bh_cachep, GFP_NOIO); + err = -ENOMEM; + if (!tmp) + goto error; + + memset(tmp, 0, sizeof(*tmp)); + init_waitqueue_head(&tmp->b_wait); + tmp->b_dev = B_FREE; + tmp->b_size = sector_size; + set_bh_page(tmp, page, offset); + tmp->b_this_page = tmp; + + init_buffer(tmp, end_buffer_io_kiobuf_async, NULL); + tmp->b_dev = dev; + tmp->b_blocknr = blknr++; + tmp->b_state = (1 << BH_Mapped) | (1 << BH_Lock) + | (1 << BH_Req); + tmp->b_private = brw_cb; + + if (rw == WRITE) { + set_bit(BH_Uptodate, &tmp->b_state); + clear_bit(BH_Dirty, &tmp->b_state); + } + + brw_cb->bh[brw_cb->nr++] = tmp; + length -= sector_size; + offset += sector_size; + + if (offset >= PAGE_SIZE) { + offset = 0; + break; + } + + if (brw_cb->nr >= blocks) + goto submit; + } /* End of block loop */ + } /* End of page loop */ + +submit: + atomic_set(&brw_cb->io_count, brw_cb->nr+1); + /* okay, we've setup all our io requests, now fire them off! 
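brw_kvec_async() initialises io_count to nr+1 and drops the extra reference itself after the submission loop; without that bias, an I/O completing early could free brw_cb while buffer heads were still being submitted. The idiom in isolation, restated with explanatory comments:

	atomic_set(&brw_cb->io_count, brw_cb->nr + 1);	/* +1 = submitter's reference */

	for (i = 0; i < brw_cb->nr; i++)
		submit_bh(rw, brw_cb->bh[i]);	/* each completion drops one reference */

	brw_cb_put(brw_cb);	/* drop the bias: only now can the callback fire,
				 * even if every bh completed before the loop ended */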
*/ + for (i=0; inr; i++) + submit_bh(rw, brw_cb->bh[i]); + brw_cb_put(brw_cb); + + return 0; + +error: + /* Walk brw_cb_table freeing all the goop associated with each kiobuf */ + if (brw_cb) { + /* We got an error allocating the bh'es. Just free the current + buffer_heads and exit. */ + for (i = brw_cb->nr-1; i--; ) + kmem_cache_free(bh_cachep, brw_cb->bh[i]); + kfree(brw_cb); + } + + return err; +} +#if 0 +int brw_kiovec(int rw, int nr, struct kiobuf *iovec[], + kdev_t dev, int nr_blocks, unsigned long b[], int sector_size) +{ + int i; + int transferred = 0; + int err = 0; + + if (!nr) + return 0; + + /* queue up and trigger the io */ + err = brw_kiovec_async(rw, nr, iovec, dev, nr_blocks, b, sector_size); + if (err) + goto out; + + /* wait on the last iovec first -- it's more likely to finish last */ + for (i=nr; --i >= 0; ) + kiobuf_wait_for_io(iovec[i]); + + run_task_queue(&tq_disk); + + /* okay, how much data actually got through? */ + for (i=0; ierrno) { + if (!err) + err = iovec[i]->errno; + break; + } + transferred += iovec[i]->length; + } + +out: + return transferred ? transferred : err; +} +#endif diff -urN /md0/kernels/2.4/v2.4.9-ac14/fs/ext2/file.c aio-v2.4.9-ac14.diff/fs/ext2/file.c --- /md0/kernels/2.4/v2.4.9-ac14/fs/ext2/file.c Mon Sep 24 02:14:15 2001 +++ aio-v2.4.9-ac14.diff/fs/ext2/file.c Tue Sep 25 14:02:13 2001 @@ -47,6 +47,10 @@ open: generic_file_open, release: ext2_release_file, fsync: ext2_sync_file, + aio_read: generic_file_aio_read, + aio_write: generic_file_aio_write, + kvec_read: generic_file_kvec_read, + kvec_write: generic_file_kvec_write, }; struct inode_operations ext2_file_inode_operations = { diff -urN /md0/kernels/2.4/v2.4.9-ac14/fs/nfs/file.c aio-v2.4.9-ac14.diff/fs/nfs/file.c --- /md0/kernels/2.4/v2.4.9-ac14/fs/nfs/file.c Mon Sep 24 02:14:16 2001 +++ aio-v2.4.9-ac14.diff/fs/nfs/file.c Mon Sep 24 19:09:13 2001 @@ -50,6 +50,7 @@ release: nfs_release, fsync: nfs_fsync, lock: nfs_lock, + //rw_kiovec: generic_file_rw_kiovec, }; struct inode_operations nfs_file_inode_operations = { diff -urN /md0/kernels/2.4/v2.4.9-ac14/fs/select.c aio-v2.4.9-ac14.diff/fs/select.c --- /md0/kernels/2.4/v2.4.9-ac14/fs/select.c Mon Sep 24 02:14:16 2001 +++ aio-v2.4.9-ac14.diff/fs/select.c Mon Sep 24 19:11:26 2001 @@ -12,23 +12,31 @@ * 24 January 2000 * Changed sys_poll()/do_poll() to use PAGE_SIZE chunk-based allocation * of fds to overcome nfds < 16390 descriptors limit (Tigran Aivazian). + * June 2001 + * Added async_poll implementation. 
-ben */ +#include #include #include #include #include /* for STICKY_TIMEOUTS */ #include +#include +#include #include #define ROUND_UP(x,y) (((x)+(y)-1)/(y)) #define DEFAULT_POLLMASK (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM) +static kmem_cache_t *poll_table_cache; + struct poll_table_entry { - struct file * filp; - wait_queue_t wait; - wait_queue_head_t * wait_address; + wait_queue_t wait; + wait_queue_head_t *wait_address; + struct file *filp; + poll_table *p; }; struct poll_table_page { @@ -72,6 +80,72 @@ } } +void async_poll_complete(void *data) +{ + poll_table *p = data, *pwait; + struct kiocb *iocb = p->iocb; + unsigned int mask; + + pwait = p; + p->wake = 0; + wmb(); + do { + mask = iocb->filp->f_op->poll(iocb->filp, p); + mask &= p->events | POLLERR | POLLHUP; + if (mask) { + poll_freewait(p); + aio_complete(iocb, mask, 0); + return; + } + p->sync = 0; + wmb(); + } while (p->wake); + +} + +static void async_poll_waiter(wait_queue_t *wait) +{ + struct poll_table_entry *entry = (struct poll_table_entry *)wait; + poll_table *p = entry->p; + + /* avoid writes to the cacheline if possible for SMP */ + if (!p->wake) { + p->wake = 1; + /* ensure only one wake up queues the wtd */ + if (!p->sync && !test_and_set_bit(0, &p->sync)) + wtd_queue(&p->wtd); + } +} + +int async_poll(struct kiocb *iocb, int events) +{ + unsigned int mask; + poll_table *p, *pwait; + + p = kmem_cache_alloc(poll_table_cache, SLAB_KERNEL); + if (!p) + return -ENOMEM; + + poll_initwait(p); + wtd_set_action(&p->wtd, async_poll_complete, p); + p->iocb = iocb; + p->wake = 0; + p->sync = 0; + p->events = events; + pwait = p; + + mask = DEFAULT_POLLMASK; + if (iocb->filp->f_op && iocb->filp->f_op->poll) + mask = iocb->filp->f_op->poll(iocb->filp, p); + mask &= events | POLLERR | POLLHUP; + if (mask) { + poll_freewait(p); + aio_complete(iocb, mask, 0); + } + + return 0; +} + void __pollwait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p) { struct poll_table_page *table = p->table; @@ -98,7 +172,11 @@ get_file(filp); entry->filp = filp; entry->wait_address = wait_address; - init_waitqueue_entry(&entry->wait, current); + entry->p = p; + if (p->iocb) + init_waitqueue_func_entry(&entry->wait, async_poll_waiter); + else + init_waitqueue_entry(&entry->wait, current); add_wait_queue(wait_address,&entry->wait); } } @@ -494,3 +572,14 @@ poll_freewait(&table); return err; } + +static int __init poll_init(void) +{ + poll_table_cache = kmem_cache_create("poll table", + sizeof(poll_table), 0, 0, NULL, NULL); + if (!poll_table_cache) + panic("unable to alloc poll_table_cache"); + return 0; +} + +module_init(poll_init); diff -urN /md0/kernels/2.4/v2.4.9-ac14/include/asm-i386/errno.h aio-v2.4.9-ac14.diff/include/asm-i386/errno.h --- /md0/kernels/2.4/v2.4.9-ac14/include/asm-i386/errno.h Mon Feb 26 10:20:14 2001 +++ aio-v2.4.9-ac14.diff/include/asm-i386/errno.h Mon Sep 24 19:09:13 2001 @@ -128,5 +128,6 @@ #define ENOMEDIUM 123 /* No medium found */ #define EMEDIUMTYPE 124 /* Wrong medium type */ +#define ENOAIO 125 /* fd does not support aio */ #endif diff -urN /md0/kernels/2.4/v2.4.9-ac14/include/asm-i386/unistd.h aio-v2.4.9-ac14.diff/include/asm-i386/unistd.h --- /md0/kernels/2.4/v2.4.9-ac14/include/asm-i386/unistd.h Fri Aug 11 17:39:23 2000 +++ aio-v2.4.9-ac14.diff/include/asm-i386/unistd.h Mon Sep 24 19:09:13 2001 @@ -227,9 +227,18 @@ #define __NR_madvise1 219 /* delete when C lib stub is removed */ #define __NR_getdents64 220 #define __NR_fcntl64 221 +/* reserved for tux 222 */ +#define __NR___io_setup 223 +#define 
__NR___io_destroy 224 +#define __NR___io_getevents 225 +#define __NR___io_submit 226 +#define __NR___io_cancel 227 +#define __NR___io_wait 228 /* user-visible error numbers are in the range -1 - -124: see */ - +#ifdef NO_SYSCALL_ERRNO +#define __syscall_return(type, res) return (type)(res) +#else #define __syscall_return(type, res) \ do { \ if ((unsigned long)(res) >= (unsigned long)(-125)) { \ @@ -238,6 +247,7 @@ } \ return (type) (res); \ } while (0) +#endif /* XXX - _foo needs to be __foo, while __NR_bar could be _NR_bar. */ #define _syscall0(type,name) \ diff -urN /md0/kernels/2.4/v2.4.9-ac14/include/linux/aio.h aio-v2.4.9-ac14.diff/include/linux/aio.h --- /md0/kernels/2.4/v2.4.9-ac14/include/linux/aio.h Wed Dec 31 19:00:00 1969 +++ aio-v2.4.9-ac14.diff/include/linux/aio.h Mon Sep 24 21:54:55 2001 @@ -0,0 +1,131 @@ +/* linux/aio.h + * Written by Benjamin LaHaise + */ +#ifndef __LINUX__AIO_H +#define __LINUX__AIO_H + +#include + +typedef unsigned long aio_context_t; + +enum { + IOCB_CMD_PREAD = 0, + IOCB_CMD_PWRITE = 1, + IOCB_CMD_FSYNC = 2, + IOCB_CMD_FDSYNC = 3, + IOCB_CMD_PREADX = 4, +}; + +/* read() from /dev/aio returns these structures. */ +struct io_event { + __u64 data; /* the data field from the iocb */ + __u64 obj; /* what iocb this event came from */ + __s64 res; /* result code for this event */ + __s64 res2; /* secondary result */ +}; + +struct aio_ring { + __u32 id; /* kernel internal index number */ + __u32 mask; /* number of io_events - 1 */ + __u32 head; + __u32 tail; + + __u32 woke; /* set when a wakeup was sent */ + __u32 pad1; + __u32 pad2; + __u32 pad3; + + __u32 pad4[24]; /* pad out to 128 bytes */ + + struct io_event io_events[0]; +}; /* 128 bytes + ring size */ + +#if defined(__LITTLE_ENDIAN) +#define PADDED(x,y) x, y +#elif defined(__BIG_ENDIAN) +#define PADDED(x,y) y, x +#else +#error edit for your odd byteorder. +#endif + +/* + * we always use a 64bit off_t when communicating + * with userland. its up to libraries to do the + * proper padding and aio_error abstraction + */ + +struct iocb { + /* these are internal to the kernel/libc. 
*/ + __u64 aio_data; /* data to be returned in event's data */ + __u32 PADDED(aio_key, aio_reserved1); + /* the kernel sets aio_key to the req # */ + + /* common fields */ + __u16 aio_lio_opcode; /* see IOCB_CMD_ above */ + __s16 aio_reqprio; + __u32 aio_fildes; + + __u64 aio_buf; + __u64 aio_nbytes; + __s64 aio_offset; + + /* extra parameters */ + __u64 aio_reserved2; + __u64 aio_reserved3; +}; /* 64 bytes */ + +#undef IFBIG +#undef IFLITTLE + +#ifdef __KERNEL__ +#ifndef __LINUX__KIOVEC_H +#include +#endif +#include + +#define AIO_MAXSEGS 4 +#define AIO_KIOGRP_NR_ATOMIC 8 + +struct kioctx; + +struct kiocb { + void (*cancel)(void *data, struct kioctx *ctx, int idx); + struct file *filp; + struct kioctx *ctx; + void *user_obj; + __u64 user_data; + ssize_t nr_read; +}; + +struct kioctx { + atomic_t users; + + /* This needs improving */ + unsigned long user_id; + struct kioctx *next; + struct mm_struct *mm; + + wait_queue_head_t wait; + + spinlock_t lock; + + struct kiocb *reqs; + struct kiocb *free_req; + + unsigned max_reqs; + unsigned ring_mask; + struct aio_ring *ring; +}; + +extern struct file_operations aio_fops; + +extern void aio_complete(struct kiocb *iocb, long res, long res2); +extern void __aioctx_put(struct kioctx *ctx); + +#define aioctx_get(kioctx) atomic_inc(&(kioctx)->users) +#define aioctx_put(kioctx) do { if (atomic_dec_and_test(&(kioctx)->users)) __aioctx_put(kioctx); } while (0) + +#endif /*__KERNEL__*/ + +#endif /* __AIO_H__ */ + diff -urN /md0/kernels/2.4/v2.4.9-ac14/include/linux/brlock.h aio-v2.4.9-ac14.diff/include/linux/brlock.h --- /md0/kernels/2.4/v2.4.9-ac14/include/linux/brlock.h Mon Sep 24 02:14:16 2001 +++ aio-v2.4.9-ac14.diff/include/linux/brlock.h Mon Sep 24 21:55:50 2001 @@ -34,6 +34,7 @@ enum brlock_indices { BR_GLOBALIRQ_LOCK, BR_NETPROTO_LOCK, + BR_AIO_LOCK, __BR_END }; diff -urN /md0/kernels/2.4/v2.4.9-ac14/include/linux/compiler.h aio-v2.4.9-ac14.diff/include/linux/compiler.h --- /md0/kernels/2.4/v2.4.9-ac14/include/linux/compiler.h Wed Dec 31 19:00:00 1969 +++ aio-v2.4.9-ac14.diff/include/linux/compiler.h Mon Sep 24 02:16:05 2001 @@ -0,0 +1,16 @@ +#ifndef __LINUX_COMPILER_H +#define __LINUX_COMPILER_H + +/* Somewhere in the middle of the GCC 2.96 development cycle, we implemented + a mechanism by which the user can annotate likely branch directions and + expect the blocks to be reordered appropriately. Define __builtin_expect + to nothing for earlier compilers. 
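With the __NR___io_* numbers from unistd.h and the structures from linux/aio.h, the interface can be driven directly from userspace. A minimal sketch, not part of the patch: it assumes linux/aio.h is usable from userspace and uses raw syscall(2) since no library wrapper exists yet; error handling is kept to a minimum.

#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <time.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/aio.h>

int main(void)
{
	aio_context_t ctx = 0;			/* must be zero before __io_setup */
	struct iocb cb, *cbs[1] = { &cb };
	struct io_event ev;
	struct timespec ts = { 5, 0 };		/* wait up to five seconds */
	char buf[4096];
	int fd = open("/etc/hosts", O_RDONLY);

	if (fd < 0 || syscall(__NR___io_setup, 64, &ctx))
		return 1;			/* room for 64 in-flight requests */

	memset(&cb, 0, sizeof(cb));
	cb.aio_fildes     = fd;
	cb.aio_lio_opcode = IOCB_CMD_PREAD;
	cb.aio_buf        = (unsigned long)buf;
	cb.aio_nbytes     = sizeof(buf);
	cb.aio_offset     = 0;
	cb.aio_data       = 0x1234;		/* echoed back in ev.data */

	if (syscall(__NR___io_submit, ctx, 1, cbs) != 1)
		return 1;

	/* note: with a NULL timeout this version returns immediately when the
	 * ring is empty, so pass a timespec in order to actually sleep */
	if (syscall(__NR___io_getevents, ctx, 1, &ev, &ts) == 1)
		printf("read %lld bytes, cookie %llx\n",
		       (long long)ev.res, (unsigned long long)ev.data);

	syscall(__NR___io_destroy, ctx);
	return 0;
}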
*/ + +#if __GNUC__ == 2 && __GNUC_MINOR__ < 96 +#define __builtin_expect(x, expected_value) (x) +#endif + +#define likely(x) __builtin_expect((x),1) +#define unlikely(x) __builtin_expect((x),0) + +#endif /* __LINUX_COMPILER_H */ diff -urN /md0/kernels/2.4/v2.4.9-ac14/include/linux/event.h aio-v2.4.9-ac14.diff/include/linux/event.h --- /md0/kernels/2.4/v2.4.9-ac14/include/linux/event.h Wed Dec 31 19:00:00 1969 +++ aio-v2.4.9-ac14.diff/include/linux/event.h Mon Sep 24 19:09:13 2001 @@ -0,0 +1,21 @@ +#ifndef _LINUX_KEVENTQ_H +#define _LINUX_KEVENTQ_H + +typedef struct file *keventq_t; + +keventq_t keventq_get(int qid); +#define keventq_put(evq) fput(evq) + +keventq_t keventq_get(int qid) +{ + struct file *filp = fget(qid); + if (filp) { + if (&keventq_fops == filp->f_op) + return filp; + fput(filp); + } + return NULL; +} + + +#endif diff -urN /md0/kernels/2.4/v2.4.9-ac14/include/linux/fs.h aio-v2.4.9-ac14.diff/include/linux/fs.h --- /md0/kernels/2.4/v2.4.9-ac14/include/linux/fs.h Mon Sep 24 02:14:16 2001 +++ aio-v2.4.9-ac14.diff/include/linux/fs.h Tue Sep 25 14:06:25 2001 @@ -20,7 +20,6 @@ #include #include #include -#include #include #include @@ -803,7 +802,21 @@ * NOTE: * read, write, poll, fsync, readv, writev can be called * without the big kernel lock held in all filesystems. - */ + * + * rw_kiovec returns the number of bytes that will actually + * be transferred into the kiovec, or an error that occurred + * during queueing. + */ +struct iocb; +struct kioctx; +struct kiocb; +struct kiobuf; +#include /* FIXME */ +#include + +#define F_ATOMIC 0x0001 +#define F_OFFSETOK 0x0002 + struct file_operations { struct module *owner; loff_t (*llseek) (struct file *, loff_t, int); @@ -823,6 +836,20 @@ ssize_t (*writev) (struct file *, const struct iovec *, unsigned long, loff_t *); ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int); unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); + + + /* this will replace read/write ops above in 2.5 */ + ssize_t (*new_read) (struct file *, char *, size_t, loff_t *, int); + ssize_t (*new_write) (struct file *, char *, size_t, loff_t *, int); + + ssize_t (*aio_read)(struct file *, struct kiocb *, struct iocb); + ssize_t (*aio_readx)(struct file *, struct kiocb *, struct iocb); + ssize_t (*aio_write)(struct file *, struct kiocb *, struct iocb); + ssize_t (*aio_fsync)(struct file *, struct kiocb *, struct iocb); + + /* in-kernel async api */ + int (*kvec_read)(struct file *, kvec_cb_t, size_t, loff_t); + int (*kvec_write)(struct file *, kvec_cb_t, size_t, loff_t); }; struct inode_operations { @@ -1401,6 +1428,12 @@ unsigned long *); extern int block_sync_page(struct page *); +extern int generic_aio_read(struct file *, struct kiocb *, struct iocb, size_t); +extern int generic_aio_write(struct file *, struct kiocb *, struct iocb, size_t); +extern int generic_file_aio_read(struct file *, struct kiocb *, struct iocb); +extern int generic_file_aio_write(struct file *, struct kiocb *, struct iocb); +extern int generic_file_kvec_read(struct file *, kvec_cb_t, size_t, loff_t); +extern int generic_file_kvec_write(struct file *, kvec_cb_t, size_t, loff_t); int generic_block_bmap(struct address_space *, long, get_block_t *); int generic_commit_write(struct file *, struct page *, unsigned, unsigned); int block_truncate_page(struct address_space *, loff_t, get_block_t *); @@ -1411,6 +1444,7 @@ extern ssize_t generic_file_read(struct file *, char *, size_t, loff_t *); extern ssize_t 
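A filesystem opts into the new file_operations hooks the same way the ext2 hunk above does: point aio_read/aio_write at the generic helpers and supply kvec_read/kvec_write for the actual transfer. Schematically, for a hypothetical filesystem "foo":

static struct file_operations foo_file_operations = {
	read:		generic_file_read,
	write:		generic_file_write,
	mmap:		generic_file_mmap,
	open:		generic_file_open,
	aio_read:	generic_file_aio_read,
	aio_write:	generic_file_aio_write,
	kvec_read:	generic_file_kvec_read,
	kvec_write:	generic_file_kvec_write,
};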
generic_file_write(struct file *, const char *, size_t, loff_t *); extern void do_generic_file_read(struct file *, loff_t *, read_descriptor_t *, read_actor_t); +extern int generic_file_rw_kiovec(struct file *filp, int rw, int nr, struct kiobuf **kiovec, int flags, size_t size, loff_t pos); extern ssize_t generic_read_dir(struct file *, char *, size_t, loff_t *); extern loff_t generic_file_llseek(struct file *, loff_t, int); diff -urN /md0/kernels/2.4/v2.4.9-ac14/include/linux/iobuf.h aio-v2.4.9-ac14.diff/include/linux/iobuf.h --- /md0/kernels/2.4/v2.4.9-ac14/include/linux/iobuf.h Mon Sep 24 02:14:16 2001 +++ aio-v2.4.9-ac14.diff/include/linux/iobuf.h Tue Sep 25 14:09:31 2001 @@ -53,8 +53,10 @@ /* Dynamic state for IO completion: */ atomic_t io_count; /* IOs still in progress */ + int transferred; /* Number of bytes of completed IO at the beginning of the buffer */ int errno; /* Status of completed IO */ void (*end_io) (struct kiobuf *); /* Completion callback */ + void *end_io_data; wait_queue_head_t wait_queue; }; @@ -80,6 +82,8 @@ /* fs/buffer.c */ +int brw_kiovec_async(int rw, int nr, struct kiobuf *iovec[], + kdev_t dev, int nr_blocks, unsigned long b[], int size); int brw_kiovec(int rw, int nr, struct kiobuf *iovec[], kdev_t dev, unsigned long b[], int size); diff -urN /md0/kernels/2.4/v2.4.9-ac14/include/linux/kiovec.h aio-v2.4.9-ac14.diff/include/linux/kiovec.h --- /md0/kernels/2.4/v2.4.9-ac14/include/linux/kiovec.h Wed Dec 31 19:00:00 1969 +++ aio-v2.4.9-ac14.diff/include/linux/kiovec.h Mon Sep 24 19:14:01 2001 @@ -0,0 +1,36 @@ +#ifndef __LINUX__IOBUF_H +#define __LINUX__IOBUF_H + +struct page; + +struct kveclet { + struct page *page; + unsigned offset; + unsigned length; +}; + +struct kvec { + unsigned max_nr; + unsigned nr; + struct kveclet veclet[0]; +}; + +struct kvec_cb { + struct kvec *vec; + void (*fn)(void *data, struct kvec *vec, ssize_t res); + void *data; +}; + +#ifndef _LINUX_TYPES_H +#include +#endif +#ifndef _LINUX_KDEV_T_H +#include +#endif + +extern struct kvec *map_user_kvec(int rw, unsigned long va, size_t len); +extern void unmap_kvec(struct kvec *); +extern int brw_kvec_async(int rw, kvec_cb_t cb, kdev_t dev, unsigned count, + unsigned long blknr, int sector_shift); + +#endif diff -urN /md0/kernels/2.4/v2.4.9-ac14/include/linux/lib_lio.h aio-v2.4.9-ac14.diff/include/linux/lib_lio.h --- /md0/kernels/2.4/v2.4.9-ac14/include/linux/lib_lio.h Wed Dec 31 19:00:00 1969 +++ aio-v2.4.9-ac14.diff/include/linux/lib_lio.h Mon Sep 24 19:09:13 2001 @@ -0,0 +1,108 @@ +#ifndef __LIB_LIO_H +#define __LIB_LIO_H + +struct timespec; +struct sockaddr; +struct iovec; + + +typedef enum lio_iocb_cmd { + + LIO_CMD_PREAD, + LIO_CMD_PWRITE, + LIO_CMD_ACCEPT, + LIO_CMD_CONNECT, + LIO_CMD_SENDTO, + LIO_CMD_RECVFROM, + + LIO_CMD_POLL, +} lio_iocb_cmd_t; + +struct lio_iocb_sendto { + void *msg; + int len; + int flags; + struct sockaddr *addr; +}; + +struct lio_iocb_poll { + int events; +}; /* result code is the set of result flags or -'ve errno */ + +struct lio_iocb_sockaddr { + struct sockaddr *addr; + int len; +}; /* result code is the length of the sockaddr, or -'ve errno */ + +struct lio_iocb_common { + void *buf; + long nbytes; + long long offset; +}; /* result code is the amount read or -'ve errno */ + +struct lio_iocb_vector { + const struct iovec *vec; + int nr; + long long offset; +}; /* result code is the amount read or -'ve errno */ + +typedef struct lio_iocb { + long key; /* For use in identifying io requests */ + void *data; /* Return in the io completion event */ + int 
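A kvec_cb couples a pinned-page vector with a completion callback: the submitter fills in vec, fn and data, and whoever finishes the transfer calls fn exactly once with the result, as generic_aio_read()/generic_aio_complete() do above. The contract in miniature (sketch; which side unmaps the vec is not spelled out in this hunk and is left out here):

static void my_kvec_done(void *data, struct kvec *vec, ssize_t res)
{
	struct kiocb *iocb = data;

	aio_complete(iocb, res, 0);		/* post the completion event */
}

	/* submission side, inside an aio_read-style method: */
	kvec_cb_t cb;

	cb.vec  = map_user_kvec(READ, buf, size);	/* pin the user buffer */
	cb.fn   = my_kvec_done;
	cb.data = req;
	if (!IS_ERR(cb.vec))
		return file->f_op->kvec_read(file, cb, size, pos);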
aio_fildes; + short aio_reqprio; + short aio_lio_opcode; + + union { + struct lio_iocb_common c; + struct lio_iocb_vector v; + struct lio_iocb_poll poll; + struct lio_iocb_sockaddr saddr; + } u; +} lio_iocb_t; + +typedef void (*lio_callback_t)(int qid, lio_iocb_t *iocb, long result); + +extern int lio_queue_init(int maxevents); +extern int lio_queue_grow(int qid, int new_maxevents); +extern int lio_queue_release(int qid); +extern int lio_queue_wait(int qid, struct timespec *timeout); +extern int lio_queue_run(int qid); +extern int lio_submit(int qid, int nr, lio_iocb_t *ios[]); + +static inline void lio_prep_accept(lio_iocb_t *iocb, int s, struct sockaddr *addr, int addrlen) +{ + iocb->aio_fildes = s; + iocb->aio_lio_opcode = LIO_CMD_ACCEPT; + iocb->aio_reqprio = 0; + iocb->u.c.buf = addr; + iocb->u.c.nbytes = addrlen; + iocb->u.c.offset = 0; +} + +static inline void lio_prep_pread(lio_iocb_t *iocb, int fd, void *buf, long count, long long offset) +{ + iocb->aio_fildes = fd; + iocb->aio_lio_opcode = LIO_CMD_PREAD; + iocb->aio_reqprio = 0; + iocb->u.c.buf = buf; + iocb->u.c.nbytes = count; + iocb->u.c.offset = offset; +} + +static inline void lio_prep_poll(lio_iocb_t *iocb, lio_callback_t *cb, int fd, int events) +{ + iocb->data = cb; + iocb->aio_fildes = fd; + iocb->aio_lio_opcode = LIO_CMD_POLL; + iocb->aio_reqprio = 0; + iocb->u.poll.events = events; +} + +static inline int lio_poll(int qid, lio_iocb_t *iocb, lio_callback_t *cb, int fd, int events) +{ + lio_prep_poll(iocb, cb, fd, events); + return lio_submit(qid, 1, &iocb); +} + +#endif diff -urN /md0/kernels/2.4/v2.4.9-ac14/include/linux/mm.h aio-v2.4.9-ac14.diff/include/linux/mm.h --- /md0/kernels/2.4/v2.4.9-ac14/include/linux/mm.h Mon Sep 24 02:14:16 2001 +++ aio-v2.4.9-ac14.diff/include/linux/mm.h Tue Sep 25 14:09:29 2001 @@ -322,8 +322,7 @@ smp_mb__before_clear_bit(); \ if (!test_and_clear_bit(PG_locked, &(page)->flags)) BUG(); \ smp_mb__after_clear_bit(); \ - if (waitqueue_active(&(page)->wait)) \ - wake_up(&(page)->wait); \ + wake_up(&(page)->wait); \ } while (0) #define PageError(page) test_bit(PG_error, &(page)->flags) #define SetPageError(page) set_bit(PG_error, &(page)->flags) diff -urN /md0/kernels/2.4/v2.4.9-ac14/include/linux/net.h aio-v2.4.9-ac14.diff/include/linux/net.h --- /md0/kernels/2.4/v2.4.9-ac14/include/linux/net.h Mon Sep 24 21:54:50 2001 +++ aio-v2.4.9-ac14.diff/include/linux/net.h Mon Sep 24 21:54:55 2001 @@ -83,6 +83,9 @@ struct scm_cookie; struct vm_area_struct; struct page; +struct iocb; +struct kioctx; +#include /* shut gcc up */ struct proto_ops { int family; @@ -110,6 +113,7 @@ int (*recvmsg) (struct socket *sock, struct msghdr *m, int total_len, int flags, struct scm_cookie *scm); int (*mmap) (struct file *file, struct socket *sock, struct vm_area_struct * vma); ssize_t (*sendpage) (struct socket *sock, struct page *page, int offset, size_t size, int flags); + int (*begin_read) (struct socket *sock, struct kioctx *ctx, struct iocb iocb, struct iocb *iocbptr); }; struct net_proto_family diff -urN /md0/kernels/2.4/v2.4.9-ac14/include/linux/poll.h aio-v2.4.9-ac14.diff/include/linux/poll.h --- /md0/kernels/2.4/v2.4.9-ac14/include/linux/poll.h Mon Sep 24 21:54:56 2001 +++ aio-v2.4.9-ac14.diff/include/linux/poll.h Tue Sep 25 14:09:31 2001 @@ -7,14 +7,25 @@ #include #include +#ifndef __LINUX__MM_H #include +#endif #include +#ifndef __LINUX__WORKTODO_H +#include +#endif struct poll_table_page; +struct kiocb; typedef struct poll_table_struct { - int error; - struct poll_table_page * table; + struct worktodo wtd; 
+ int error; + struct poll_table_page *table; + struct kiocb *iocb; /* iocb for async poll */ + int events; /* event mask for async poll */ + int wake; + long sync; } poll_table; extern void __pollwait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p); @@ -29,7 +40,9 @@ { pt->error = 0; pt->table = NULL; + pt->iocb = NULL; } + extern void poll_freewait(poll_table* pt); diff -urN /md0/kernels/2.4/v2.4.9-ac14/include/linux/sched.h aio-v2.4.9-ac14.diff/include/linux/sched.h --- /md0/kernels/2.4/v2.4.9-ac14/include/linux/sched.h Mon Sep 24 02:14:16 2001 +++ aio-v2.4.9-ac14.diff/include/linux/sched.h Tue Sep 25 14:09:20 2001 @@ -770,6 +770,7 @@ extern void FASTCALL(add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)); extern void FASTCALL(add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait)); +extern void FASTCALL(add_wait_queue_exclusive_lifo(wait_queue_head_t *q, wait_queue_t * wait)); extern void FASTCALL(remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)); #define __wait_event(wq, condition) \ diff -urN /md0/kernels/2.4/v2.4.9-ac14/include/linux/tqueue.h aio-v2.4.9-ac14.diff/include/linux/tqueue.h --- /md0/kernels/2.4/v2.4.9-ac14/include/linux/tqueue.h Mon Sep 24 21:54:51 2001 +++ aio-v2.4.9-ac14.diff/include/linux/tqueue.h Mon Sep 24 21:54:55 2001 @@ -67,6 +67,7 @@ #define TQ_ACTIVE(q) (!list_empty(&q)) extern task_queue tq_timer, tq_immediate, tq_disk; +extern struct tq_struct run_disk_tq; /* * To implement your own list of active bottom halfs, use the following diff -urN /md0/kernels/2.4/v2.4.9-ac14/include/linux/types.h aio-v2.4.9-ac14.diff/include/linux/types.h --- /md0/kernels/2.4/v2.4.9-ac14/include/linux/types.h Mon Sep 24 21:54:50 2001 +++ aio-v2.4.9-ac14.diff/include/linux/types.h Mon Sep 24 19:14:01 2001 @@ -127,4 +127,9 @@ char f_fpack[6]; }; +/* kernel typedefs -- they belong here. */ +#ifdef __KERNEL__ +typedef struct kvec_cb kvec_cb_t; +#endif /* __KERNEL__ */ + #endif /* _LINUX_TYPES_H */ diff -urN /md0/kernels/2.4/v2.4.9-ac14/include/linux/wait.h aio-v2.4.9-ac14.diff/include/linux/wait.h --- /md0/kernels/2.4/v2.4.9-ac14/include/linux/wait.h Mon Sep 24 02:14:16 2001 +++ aio-v2.4.9-ac14.diff/include/linux/wait.h Mon Sep 24 21:54:55 2001 @@ -28,17 +28,20 @@ #define WAITQUEUE_DEBUG 0 #endif +typedef struct __wait_queue wait_queue_t; +typedef void (*wait_queue_func_t)(wait_queue_t *wait); + struct __wait_queue { unsigned int flags; #define WQ_FLAG_EXCLUSIVE 0x01 struct task_struct * task; struct list_head task_list; + wait_queue_func_t func; #if WAITQUEUE_DEBUG long __magic; long __waker; #endif }; -typedef struct __wait_queue wait_queue_t; /* * 'dual' spinlock architecture. 
Can be switched between spinlock_t and @@ -137,6 +140,7 @@ #endif #define __WAITQUEUE_INITIALIZER(name, tsk) { \ + func: NULL, \ task: tsk, \ task_list: { NULL, NULL }, \ __WAITQUEUE_DEBUG_INIT(name)} @@ -174,6 +178,22 @@ #endif q->flags = 0; q->task = p; + q->func = NULL; +#if WAITQUEUE_DEBUG + q->__magic = (long)&q->__magic; +#endif +} + +static inline void init_waitqueue_func_entry(wait_queue_t *q, + wait_queue_func_t func) +{ +#if WAITQUEUE_DEBUG + if (!q) + WQ_BUG(); +#endif + q->flags = 0; + q->task = NULL; + q->func = func; #if WAITQUEUE_DEBUG q->__magic = (long)&q->__magic; #endif @@ -231,6 +251,19 @@ list_del(&old->task_list); } +#define add_wait_queue_cond(q, wait, cond, fail) \ + do { \ + unsigned long flags; \ + wq_write_lock_irqsave(&(q)->lock, flags); \ + (wait)->flags = 0; \ + if (cond) \ + __add_wait_queue((q), (wait)); \ + else { \ + fail; \ + } \ + wq_write_unlock_irqrestore(&(q)->lock, flags); \ + } while (0) + #endif /* __KERNEL__ */ #endif diff -urN /md0/kernels/2.4/v2.4.9-ac14/include/linux/worktodo.h aio-v2.4.9-ac14.diff/include/linux/worktodo.h --- /md0/kernels/2.4/v2.4.9-ac14/include/linux/worktodo.h Wed Dec 31 19:00:00 1969 +++ aio-v2.4.9-ac14.diff/include/linux/worktodo.h Mon Sep 24 21:54:56 2001 @@ -0,0 +1,39 @@ +#ifndef __LINUX__WORKTODO_H +#define __LINUX__WORKTODO_H + +#ifndef _LINUX_WAIT_H +#include +#endif +#ifndef _LINUX_TQUEUE_H +#include +#endif + +struct worktodo { + wait_queue_t wait; + struct tq_struct tq; + + void *data; /* for use by the wtd_ primitives */ +}; + +/* FIXME NOTE: factor from kernel/context.c */ +#define wtd_queue(wtd) schedule_task(&(wtd)->tq) + +#define wtd_set_action(wtd, action, wtddata) \ + do { \ + (wtd)->tq.routine = (action); \ + (wtd)->tq.data = (wtddata); \ + } while (0) + +struct page; +extern void wtd_wait_page(struct worktodo *wtd, struct page *page); +extern void wtd_lock_page(struct worktodo *wtd, struct page *page); +struct buffer_head; +extern void wtd_wait_on_buffer(struct worktodo *wtd, struct buffer_head *bh); + +#if 0 /* not implemented yet */ +extern void wtd_down(struct worktodo *wtd, struct semaphore *sem); +extern void wtd_down_write(struct worktodo *wtd, struct rw_semaphore *sem); +extern void wtd_down_read(struct worktodo *wtd, struct rw_semaphore *sem); +#endif + +#endif /* __LINUX__WORKTODO_H */ diff -urN /md0/kernels/2.4/v2.4.9-ac14/include/net/sock.h aio-v2.4.9-ac14.diff/include/net/sock.h --- /md0/kernels/2.4/v2.4.9-ac14/include/net/sock.h Mon Sep 24 02:14:16 2001 +++ aio-v2.4.9-ac14.diff/include/net/sock.h Tue Sep 25 14:09:31 2001 @@ -692,6 +692,10 @@ (__skb)->next = NULL; \ } while(0) +struct kioctx; +struct iocb; +#include /* FIXME */ + /* IP protocol blocks we attach to sockets. 
* socket layer -> transport layer interface * transport -> network interface is defined by struct inet_proto @@ -721,6 +725,8 @@ int (*recvmsg)(struct sock *sk, struct msghdr *msg, int len, int noblock, int flags, int *addr_len); + int (*begin_read)(struct sock *, struct kioctx *, + struct iocb, struct iocb *); int (*bind)(struct sock *sk, struct sockaddr *uaddr, int addr_len); diff -urN /md0/kernels/2.4/v2.4.9-ac14/kernel/context.c aio-v2.4.9-ac14.diff/kernel/context.c --- /md0/kernels/2.4/v2.4.9-ac14/kernel/context.c Fri May 25 22:48:10 2001 +++ aio-v2.4.9-ac14.diff/kernel/context.c Mon Sep 24 19:09:13 2001 @@ -91,12 +91,18 @@ */ for (;;) { set_task_state(curtask, TASK_INTERRUPTIBLE); - add_wait_queue(&context_task_wq, &wait); - if (TQ_ACTIVE(tq_context)) + add_wait_queue_exclusive_lifo(&context_task_wq, &wait); + if (spin_is_locked(&tqueue_lock) || TQ_ACTIVE(tq_context)) set_task_state(curtask, TASK_RUNNING); - schedule(); + else + schedule(); remove_wait_queue(&context_task_wq, &wait); run_task_queue(&tq_context); + while (TQ_ACTIVE(tq_context)) { + if (current->need_resched) + schedule(); + run_task_queue(&tq_context); + } wake_up(&context_task_done); if (signal_pending(curtask)) { while (waitpid(-1, (unsigned int *)0, __WALL|WNOHANG) > 0) diff -urN /md0/kernels/2.4/v2.4.9-ac14/kernel/fork.c aio-v2.4.9-ac14.diff/kernel/fork.c --- /md0/kernels/2.4/v2.4.9-ac14/kernel/fork.c Mon Sep 24 02:14:16 2001 +++ aio-v2.4.9-ac14.diff/kernel/fork.c Mon Sep 24 19:09:13 2001 @@ -46,6 +46,16 @@ wq_write_unlock_irqrestore(&q->lock, flags); } +void add_wait_queue_exclusive_lifo(wait_queue_head_t *q, wait_queue_t * wait) +{ + unsigned long flags; + + wq_write_lock_irqsave(&q->lock, flags); + wait->flags = WQ_FLAG_EXCLUSIVE; + __add_wait_queue(q, wait); + wq_write_unlock_irqrestore(&q->lock, flags); +} + void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait) { unsigned long flags; diff -urN /md0/kernels/2.4/v2.4.9-ac14/kernel/sched.c aio-v2.4.9-ac14.diff/kernel/sched.c --- /md0/kernels/2.4/v2.4.9-ac14/kernel/sched.c Mon Sep 24 02:14:16 2001 +++ aio-v2.4.9-ac14.diff/kernel/sched.c Mon Sep 24 19:09:13 2001 @@ -714,13 +714,13 @@ } /* - * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just wake everything - * up. If it's an exclusive wakeup (nr_exclusive == small +ve number) then we wake all the - * non-exclusive tasks and one exclusive task. + * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just + * wake everything up. If it's an exclusive wakeup (nr_exclusive == small + * +ve number) then we wake all the non-exclusive tasks and one exclusive task. * * There are circumstances in which we can try to wake a task which has already - * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns zero - * in this (rare) case, and we handle it by contonuing to scan the queue. + * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns + * zero in this (rare) case, and we handle it by contonuing to scan the queue. 
*/ static inline void __wake_up_common (wait_queue_head_t *q, unsigned int mode, int nr_exclusive, const int sync) @@ -733,14 +733,25 @@ list_for_each(tmp,&q->task_list) { unsigned int state; - wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list); + wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list); + wait_queue_func_t func; CHECK_MAGIC(curr->__magic); + func = curr->func; + if (func) { + unsigned flags = curr->flags; + func(curr); + if ((flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) + break; + continue; + } p = curr->task; state = p->state; if (state & mode) { WQ_NOTE_WAKER(curr); - if (try_to_wake_up(p, sync) && (curr->flags&WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) + if (try_to_wake_up(p, sync) && + (curr->flags & WQ_FLAG_EXCLUSIVE) && + !--nr_exclusive) break; } } diff -urN /md0/kernels/2.4/v2.4.9-ac14/kernel/softirq.c aio-v2.4.9-ac14.diff/kernel/softirq.c --- /md0/kernels/2.4/v2.4.9-ac14/kernel/softirq.c Mon Sep 24 02:14:16 2001 +++ aio-v2.4.9-ac14.diff/kernel/softirq.c Mon Sep 24 19:09:13 2001 @@ -354,6 +354,7 @@ data = p->data; wmb(); p->sync = 0; + smp_mb(); if (f) f(data); } Binary files /md0/kernels/2.4/v2.4.9-ac14/mm/.filemap.c.swp and aio-v2.4.9-ac14.diff/mm/.filemap.c.swp differ diff -urN /md0/kernels/2.4/v2.4.9-ac14/mm/filemap.c aio-v2.4.9-ac14.diff/mm/filemap.c --- /md0/kernels/2.4/v2.4.9-ac14/mm/filemap.c Mon Sep 24 02:14:16 2001 +++ aio-v2.4.9-ac14.diff/mm/filemap.c Tue Sep 25 21:59:56 2001 @@ -22,12 +22,14 @@ #include #include #include +#include #include #include #include #include +#include /* * Shared mappings implemented 30.11.1994. It's not fully working yet, @@ -2512,7 +2514,6 @@ */ struct page *grab_cache_page_nowait(struct address_space *mapping, unsigned long index) { - struct page *cached_page = NULL; struct page *page, **hash; hash = page_hash(mapping, index); @@ -2799,3 +2800,713 @@ panic("Failed to allocate page hash table\n"); memset((void *)page_hash_table, 0, PAGE_HASH_SIZE * sizeof(struct page *)); } + +/* address_space_map + * Maps a series of pages from the page cache into the given array. + */ +static int address_space_map(struct address_space *as, unsigned long index, + int nr, struct page **pages, + int *nr_newp, struct page **new_pages) +{ + struct page *cached_page = NULL; + int nr_new = 0; + int ret; + + ret = -EINVAL; + if (nr <= 0) + goto out; + + ret = 0; + + spin_lock(&pagecache_lock); + + while (nr > 0) { + struct page **hash = page_hash(as, index); + struct page *page; + + page = __find_page_nolock(as, index, *hash); + if (page) { + page_cache_get(page); +got_page: + pages[ret++] = page; + index++; + nr--; + continue; + } + + if (cached_page) { + __add_to_page_cache(cached_page, as, index, hash); + nr_new++; + *new_pages++ = page = cached_page; + cached_page = NULL; + goto got_page; + } + spin_unlock(&pagecache_lock); + + cached_page = page_cache_alloc(as); + if (!cached_page) + goto out; + + /* Okay, we now have an allocated page. Retry + * the search and add. */ + spin_lock(&pagecache_lock); + } + + spin_unlock(&pagecache_lock); + +out: + if (cached_page) + page_cache_free(cached_page); + + *nr_newp = nr_new; + return ret ? 
ret : -ENOMEM; +} + +struct iodesc { + struct worktodo wtd; + + struct page *good_page; /* the highest Uptodate page */ + int good_idx; + int err; + int did_read; + int rw; + + struct page **pages; + struct page **new_pages; + struct page **cur_pagep; + int nr_pages; + int nr_new_pages; + + struct address_space *as; + struct file *file; + kvec_cb_t cb; + + size_t size; + unsigned long transferred; + unsigned offset; + struct kveclet *veclet; + + int sync; + +#define READDESC_NR_DEF 3 + struct page *def_pages[READDESC_NR_DEF]; + struct page *def_new_pages[READDESC_NR_DEF]; +}; + +static void __iodesc_free(struct iodesc *io, int unlock) +{ + kvec_cb_t cb; + ssize_t res; + + if (unlock) { + unsigned i; + for (i=0; i<io->nr_pages; i++) { + struct page *page = io->pages[i]; + UnlockPage(page); + deactivate_page(page); + page_cache_release(page); + } + } else { + unsigned i; + for (i=0; i<io->nr_pages; i++) + page_cache_release(io->pages[i]); + } + + if (io->new_pages != io->def_new_pages) + kfree(io->new_pages); + if (io->pages != io->def_pages) + kfree(io->pages); + + cb = io->cb; + res = io->transferred ? io->transferred : io->err; + kfree(io); + + cb.fn(cb.data, cb.vec, res); +} + +/* By the time this function is called, all of the pages prior to + * the current good_idx have been released appropriately. The remaining + * duties are to release any remaining pages and to honour O_SYNC. + */ +static void __iodesc_finish_write(struct iodesc *io) +{ + pr_debug("__iodesc_finish_write(%p)\n", io); + + __iodesc_free(io, WRITE == io->rw); +} + +/* This is mostly ripped from generic_file_write */ +static int __iodesc_write_page(struct iodesc *io, struct page *page) +{ + unsigned long bytes; + unsigned long offset, src_offset; + struct page *src_page; + long status; + char *kaddr; + int src_bytes; + char *src; + int done = 0; + unsigned left; + + src_page = io->veclet->page; + src_bytes = io->veclet->length; + src_offset = io->veclet->offset; + src = kmap(src_page) + src_offset; + + offset = io->offset; + kaddr = kmap(page); + kaddr += offset; + + bytes = PAGE_CACHE_SIZE - offset; + if (io->size < bytes) + bytes = io->size; + + pr_debug("__iodesc_write_page(%p (%lu), %lu %lu %lu)\n", page, page->index, offset, bytes, src_offset); + + io->err = io->as->a_ops->prepare_write(io->file, page, + offset, offset + bytes); + if (io->err) { +printk("prepare_write: %d\n", io->err); + goto unlock; + } + + left = bytes; + for (;;) { + unsigned this = src_bytes; + if (left < this) + this = left; + + memcpy(kaddr, src, this); + kaddr += this; + src += this; + left -= this; + src_bytes -= this; + src_offset += this; + + if (left <= 0) + break; + + if (!src_bytes) { + io->veclet++; + kunmap(src_page); + src_page = io->veclet->page; + src_bytes = io->veclet->length; + src_offset = io->veclet->offset; + src = kmap(src_page) + src_offset; + } + } + flush_dcache_page(page); + status = io->as->a_ops->commit_write(io->file, page, + offset, offset+bytes); + + /* We don't handle short writes */ + if (status > 0 && status != bytes) + done = 1; + + if (!status) + status = bytes; + else + printk("commit_write: %ld\n", status); + + if (status > 0) { + io->transferred += status; + io->size -= status; + io->offset = (offset + status) & (PAGE_CACHE_SIZE - 1); + + if (io->offset) + done = 1; + + src_offset += status; + src_offset &= PAGE_CACHE_SIZE - 1; + } else { + io->err = status; + done = 1; + } + +unlock: + kunmap(page); + kunmap(src_page); + + //UnlockPage(page); + //deactivate_page(page); + //page_cache_release(page); + + return done; 
+} + +void __iodesc_sync_wait_page(void *data) +{ + struct iodesc *io = data; + + do { + struct buffer_head *bh, *head = io->pages[io->good_idx]->buffers; + + if (!head) + continue; + + bh = head; + do { + if (buffer_locked(bh)) { + pr_debug("waiting on bh=%pi io=%p\n", bh, io); + wtd_wait_on_buffer(&io->wtd, bh); + return; + } + if (buffer_req(bh) && !buffer_uptodate(bh)) { + pr_debug("io err bh=%p (%p)\n", bh, io); + io->err = -EIO; + break; + } + } while ((bh = bh->b_this_page) != head); + } while (!io->err && ++io->good_idx < io->nr_pages) ; + + pr_debug("finish_write(%p)\n", io); + __iodesc_finish_write(io); +} + +static void __iodesc_do_write(void *data) +{ + struct iodesc *io = data; + unsigned i; + + up(&io->file->f_dentry->d_inode->i_sem); + + for (i=0; i<io->nr_pages; i++) + if (__iodesc_write_page(io, io->pages[i])) + break; + + if (io->sync) { + io->good_idx = 0; + + pr_debug("writing out pages(%p)\n", io); + for (i=0; i<io->nr_pages; i++) { + if (io->pages[i]->buffers) + writeout_one_page(io->pages[i]); + } + + pr_debug("calling __iodesc_sync_wait_page(%p)\n", io); + wtd_set_action(&io->wtd, __iodesc_sync_wait_page, io); + __iodesc_sync_wait_page(io); + return; + } + + __iodesc_finish_write(io); +} + +static void __iodesc_write_lock_next_page(void *data) +{ + struct iodesc *io = data; + pr_debug("__iodesc_write_next_page(%p)\n", io); + + while (io->good_idx < io->nr_pages) { + io->good_page = io->pages[io->good_idx++]; + if (io->good_page == *io->cur_pagep) + io->cur_pagep++; + else { + wtd_lock_page(&io->wtd, io->good_page); + return; + } + } + + //Is this faster? __iodesc_do_write(io); + wtd_set_action(&io->wtd, __iodesc_do_write, io); + wtd_queue(&io->wtd); +} + +static void __generic_file_write_iodesc(struct iodesc *io) +{ + struct inode *inode = io->file->f_dentry->d_inode; + time_t now = CURRENT_TIME; + + remove_suid(inode); + if (inode->i_ctime != now || inode->i_mtime != now) { + inode->i_ctime = inode->i_mtime = now; + mark_inode_dirty_sync(inode); + } + + wtd_set_action(&io->wtd, __iodesc_write_lock_next_page, io); + io->sync = !!(io->file->f_flags & O_SYNC); + io->good_idx = 0; + io->cur_pagep = io->new_pages; + __iodesc_write_lock_next_page(io); +} + +static void __iodesc_read_finish(struct iodesc *io) +{ + struct page **src_pagep; + char *dst_addr, *src_addr; + int src_off; + size_t size; + size_t valid; + + struct kveclet *veclet = io->veclet; + struct page *dst_page = veclet->page; + int dst_len = veclet->length; + int dst_off = veclet->offset; + + + pr_debug("__iodesc_read_finish: good_idx = %d\n", io->good_idx); + if (io->good_idx <= 0) + goto no_data; + + size = io->size; + src_off = io->offset; + src_pagep = io->pages; + src_addr = kmap(*src_pagep); + + valid = (size_t)io->good_idx << PAGE_CACHE_SHIFT; + valid -= src_off; + pr_debug("size=%d valid=%d src_off=%d\n", size, valid, src_off); + + if (valid < size) + size = valid; + + dst_addr = kmap(veclet->page); + + while (size > 0) { + int this = PAGE_CACHE_SIZE - src_off; + if ((PAGE_SIZE - dst_off) < this) + this = PAGE_SIZE - dst_off; + if (size < this) + this = size; + pr_debug("this=%d src_off=%d dst_off=%d dst_len=%d\n", + this, src_off, dst_off, dst_len); + memcpy(dst_addr + dst_off, src_addr + src_off, this); + + src_off += this; + dst_off += this; + dst_len -= this; + size -= this; + io->transferred += this; + pr_debug("read_finish: this=%d transferred=%d\n", + this, io->transferred); + + if (size <= 0) + break; + + if (dst_len <= 0) { + kunmap(dst_page); + veclet++; + dst_page = veclet->page; + dst_off = 
veclet->offset; + dst_len = veclet->length; + dst_addr = kmap(dst_page); + } + + if (src_off >= PAGE_SIZE) { /* FIXME: PAGE_CACHE_SIZE */ + kunmap(*src_pagep); + pr_debug("page(%lu)->count = %d\n", + (*src_pagep)->index, + atomic_read(&(*src_pagep)->count)); + src_pagep++; + src_addr = kmap(*src_pagep); + src_off = 0; + } + } + kunmap(dst_page); + kunmap(*src_pagep); +no_data: + __iodesc_free(io, 0); +} + +static void __iodesc_make_uptodate(void *data) +{ + struct iodesc *io = data; + struct page *page = io->good_page; + int locked = 1; + + pr_debug("__iodesc_make_uptodate: io=%p index=%lu\n", io, page->index); + while (Page_Uptodate(page)) { +again: + pr_debug("page index %lu uptodate\n", page->index); + if (locked) { + UnlockPage(page); + locked = 0; + } + io->did_read = 0; + io->good_idx++; + if (io->good_idx >= io->nr_pages) { + __iodesc_read_finish(io); + return; + } + page = io->good_page = io->pages[io->good_idx]; + pr_debug("__iodesc_make_uptodate: index=%lu\n", page->index); + } + + if (!locked) { + wtd_lock_page(&io->wtd, page); + return; + } + + if (!io->did_read) { + /* We haven't tried reading this page before, give it a go. */ + printk("attempting to read %lu\n", page->index); + io->did_read = 1; + io->err = page->mapping->a_ops->readpage(io->file, page); + if (!io->err) { + if (Page_Uptodate(page)) + goto again; + wtd_lock_page(&io->wtd, page); + return; + } + } + + if (locked) + UnlockPage(page); + + /* We've already read this page before. Set err to EIO and quit. */ + if (!io->err) + io->err = -EIO; + __iodesc_read_finish(io); +} + +static void __wtdgeneric_file_read_iodesc(void *data); + +static void __generic_file_read_iodesc(struct iodesc *io, int mayblock) +{ + int (*readpage)(struct file *, struct page *); + int i; + + wtd_set_action(&io->wtd, __iodesc_make_uptodate, io); + readpage = io->as->a_ops->readpage; + for (i=0; i<io->nr_new_pages; i++) { + int ret; + if (!mayblock) { + static int zoo; if (zoo++ < 5) printk("read sleep\n"); + wtd_set_action(&io->wtd, __wtdgeneric_file_read_iodesc, io); + wtd_queue(&io->wtd); + } + ret = readpage(io->file, io->new_pages[i]); + if (ret) + printk(KERN_DEBUG "__generic_file_read_kiovec: readpage(%lu) = %d\n", io->new_pages[i]->index, ret); + } + + for (i=0; i<io->nr_pages; i++) { + struct page *page = io->pages[i]; + if (Page_Uptodate(page)) { + pr_debug("__generic_file_read_iodesc: %lu is uptodate\n", page->index); + continue; + } + + if (!mayblock) { + static int zoo; if (zoo++ < 5) printk("read sleep\n"); + wtd_set_action(&io->wtd, __wtdgeneric_file_read_iodesc, io); + wtd_queue(&io->wtd); + } + if (!TryLockPage(page)) { + int ret = readpage(io->file, page); + if (ret) + printk(KERN_DEBUG "__generic_file_read_iodesc: readpage(%lu): %d\n", page->index, ret); + } + + if (!Page_Uptodate(page) && io->good_idx == -1) { + pr_debug("first good_idx=%d (%lu)\n", i, page->index); + io->good_idx = i; + io->good_page = page; + } + } + + /* Whee, all the pages are uptodate! 
*/ + if (!io->good_page) { + static int zoo; if (!mayblock && zoo++ < 5) printk("all uptodate\n"); + pr_debug("all pages uptodate!\n"); + io->good_idx = io->nr_pages; + __iodesc_read_finish(io); + return; + } + + pr_debug("locking good_page\n"); + wtd_lock_page(&io->wtd, io->good_page); + return; +} + +static void __wtdgeneric_file_read_iodesc(void *data) +{ + struct iodesc *io = data; + __generic_file_read_iodesc(io, 1); +} + +static int generic_file_rw_kvec(struct file *file, int rw, kvec_cb_t cb, + size_t size, loff_t pos); + +int generic_file_kvec_read(struct file *file, kvec_cb_t cb, size_t size, loff_t pos) +{ + return generic_file_rw_kvec(file, READ, cb, size, pos); +} + +int generic_file_kvec_write(struct file *file, kvec_cb_t cb, size_t size, loff_t pos) +{ + return generic_file_rw_kvec(file, WRITE, cb, size, pos); +} + +int generic_file_rw_kvec(struct file *file, int rw, kvec_cb_t cb, + size_t size, loff_t pos) +{ + struct inode *inode = file->f_dentry->d_inode; + struct address_space *as = inode->i_mapping; + unsigned long index; + unsigned long eindex; + unsigned long nr_pages; + struct iodesc *io = NULL; + int ret; + + ret = -EINVAL; + if (rw != READ && rw != WRITE) + goto out; + + ret = -ENOMEM; + io = kmalloc(sizeof(*io), GFP_KERNEL); + if (!io) + goto out; + + memset(io, 0, sizeof(*io)); + io->size = size; + + if (READ == rw) { + pr_debug("pos=%Ld i_size=%Ld\n", pos, inode->i_size); + + if (pos > inode->i_size) + size = 0; + else if ((pos + size) > inode->i_size) + size = inode->i_size - pos; + + if (io->size < size) + size = io->size; + else if (size < io->size) + io->size = size; + + pr_debug("io->size=%d size=%d\n", io->size, size); + } + + index = pos >> PAGE_CACHE_SHIFT; + eindex = (pos + size - 1) >> PAGE_CACHE_SHIFT; + nr_pages = eindex - index + 1; + + pr_debug("nr_pages: %lu\n", nr_pages); + + io->good_idx = -1; + io->good_page = NULL; + io->did_read = 0; + io->err = 0; + io->rw = rw; + io->as = as; + io->offset = (unsigned long)pos & (PAGE_CACHE_SIZE - 1); + io->file = file; + io->cb = cb; + io->veclet = cb.vec->veclet; + if (nr_pages < READDESC_NR_DEF) { + io->pages = io->def_pages; + io->new_pages = io->def_new_pages; + } else { + io->pages = kmalloc(sizeof(*io->pages) * (nr_pages + 1), GFP_KERNEL); + if (!io->pages) + goto out_io; + + io->new_pages = kmalloc(sizeof(*io->new_pages) * (nr_pages + 1), GFP_KERNEL); + if (!io->new_pages) + goto out_pages; + } + + /* FIXME: make the down a WTD_op */ + if (rw == WRITE) + down(&io->file->f_dentry->d_inode->i_sem); + + ret = address_space_map(as, index, nr_pages, io->pages, + &io->nr_new_pages, io->new_pages); + pr_debug("as_map: %d (%d new)\n", ret, io->nr_new_pages); + if (ret <= 0) + goto out_new_pages; + + io->nr_pages = ret; + io->pages[io->nr_pages] = NULL; + io->new_pages[io->nr_new_pages] = NULL; + + if (rw == READ) + __generic_file_read_iodesc(io, 0); + else if (rw == WRITE) + __generic_file_write_iodesc(io); + + return 0; + +out_new_pages: + if (io->new_pages != io->def_new_pages) + kfree(io->new_pages); +out_pages: + if (io->pages != io->def_pages) + kfree(io->pages); +out_io: + kfree(io); +out: + return ret; +} + +static void __wtd_lock_page_waiter(wait_queue_t *wait) +{ + struct worktodo *wtd = (struct worktodo *)wait; + struct page *page = (struct page *)wtd->data; + + if (!TryLockPage(page)) { + __remove_wait_queue(&page->wait, &wtd->wait); + wtd_queue(wtd); + } else { + schedule_task(&run_disk_tq); + } +} + +void wtd_lock_page(struct worktodo *wtd, struct page *page) +{ + if (TryLockPage(page)) { + int 
raced = 0; + wtd->data = page; + init_waitqueue_func_entry(&wtd->wait, __wtd_lock_page_waiter); + add_wait_queue_cond(&page->wait, &wtd->wait, TryLockPage(page), raced = 1); + + if (!raced) { + run_task_queue(&tq_disk); + return; + } + } + + wtd->tq.routine(wtd->tq.data); +} + +static void __wtd_bh_waiter(wait_queue_t *wait) +{ + struct worktodo *wtd = (struct worktodo *)wait; + struct buffer_head *bh = (struct buffer_head *)wtd->data; + + if (!buffer_locked(bh)) { + __remove_wait_queue(&bh->b_wait, &wtd->wait); + wtd_queue(wtd); + } else { + schedule_task(&run_disk_tq); + } +} + +void wtd_wait_on_buffer(struct worktodo *wtd, struct buffer_head *bh) +{ + int raced = 0; + + if (!buffer_locked(bh)) { + wtd->tq.routine(wtd->tq.data); + return; + } + wtd->data = bh; + init_waitqueue_func_entry(&wtd->wait, __wtd_bh_waiter); + add_wait_queue_cond(&bh->b_wait, &wtd->wait, buffer_locked(bh), raced = 1); + + if (raced) + wtd->tq.routine(wtd->tq.data); + else + run_task_queue(&tq_disk); +} + +void do_run_tq_disk(void *data) +{ + run_task_queue(&tq_disk); +} + +struct tq_struct run_disk_tq = { + routine: do_run_tq_disk, + data: NULL +}; + diff -urN /md0/kernels/2.4/v2.4.9-ac14/mm/memory.c aio-v2.4.9-ac14.diff/mm/memory.c --- /md0/kernels/2.4/v2.4.9-ac14/mm/memory.c Mon Sep 24 02:14:16 2001 +++ aio-v2.4.9-ac14.diff/mm/memory.c Mon Sep 24 21:36:43 2001 @@ -44,6 +44,7 @@ #include #include #include +#include #include #include @@ -1469,3 +1470,135 @@ } while (addr < end); return 0; } + +/* + * Force in an entire range of pages from the current process's user VA, + * and pin them in physical memory. + */ + +#define dprintk(x...) +struct kvec *map_user_kvec(int rw, unsigned long ptr, size_t len) +{ + struct kvec *vec; + struct kveclet *veclet; + unsigned long end; + int err; + struct mm_struct * mm; + struct vm_area_struct * vma = 0; + int i; + int datain = (rw == READ); + unsigned nr_pages; + + end = ptr + len; + if (end < ptr) { + printk(KERN_DEBUG "map_user_kvec: end < ptr\n"); + return ERR_PTR(-EINVAL); + } + + nr_pages = (ptr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + nr_pages -= ptr >> PAGE_SHIFT; + vec = kmalloc(sizeof(struct kvec) + nr_pages * sizeof(struct kveclet), + GFP_KERNEL); + if (!vec) + return ERR_PTR(-ENOMEM); + vec->nr = 0; + vec->max_nr = nr_pages; + veclet = vec->veclet; + + /* Make sure the iobuf is not already mapped somewhere. 
*/ + mm = current->mm; + dprintk ("map_user_kiobuf: begin\n"); + + down_read(&mm->mmap_sem); + + err = -EFAULT; + + i = 0; + + /* + * First of all, try to fault in all of the necessary pages + */ + while (ptr < end) { + struct page *map; + veclet->offset = ptr & ~PAGE_MASK; + veclet->length = PAGE_SIZE - veclet->offset; + if (len < veclet->length) + veclet->length = len; + ptr &= PAGE_MASK; + + if (!vma || ptr >= vma->vm_end) { + vma = find_vma(current->mm, ptr); + if (!vma) + goto out_unlock; + if (vma->vm_start > ptr) { + if (!(vma->vm_flags & VM_GROWSDOWN)) + goto out_unlock; + if (expand_stack(vma, ptr)) + goto out_unlock; + } + if (((datain) && (!(vma->vm_flags & VM_WRITE))) || + (!(vma->vm_flags & VM_READ))) { + err = -EACCES; + goto out_unlock; + } + } + spin_lock(&mm->page_table_lock); + while (!(map = follow_page(ptr, datain))) { + int ret; + + spin_unlock(&mm->page_table_lock); + ret = handle_mm_fault(current->mm, vma, ptr, datain); + if (ret <= 0) { + if (!ret) + goto out_unlock; + else { + err = -ENOMEM; + goto out_unlock; + } + } + spin_lock(&mm->page_table_lock); + } + map = get_page_map(map); + if (map) { + flush_dcache_page(map); + atomic_inc(&map->count); + } else + printk (KERN_INFO "Mapped page missing [%d]\n", i); + spin_unlock(&mm->page_table_lock); + veclet->page = map; + veclet++; + + ptr += PAGE_SIZE; + len -= PAGE_SIZE; + vec->nr = ++i; + } + + up_read(&mm->mmap_sem); + dprintk ("map_user_kiobuf: end OK\n"); + return vec; + + out_unlock: + up_read(&mm->mmap_sem); + unmap_kvec(vec); + printk(KERN_DEBUG "map_user_kvec: err(%d)\n", err); + kfree(vec); + return ERR_PTR(err); +} + +/* + * Unmap all of the pages referenced by a kiobuf. We release the pages, + * and unlock them if they were locked. + */ + +void unmap_kvec (struct kvec *vec) +{ + struct kveclet *veclet, *end = vec->veclet + vec->nr; + + for (veclet=vec->veclet; veclet<end; veclet++) { + struct page *map = veclet->page; + if (map) + __free_page(map); + } + + vec->nr = 0; +} diff -urN /md0/kernels/2.4/v2.4.9-ac14/net/ipv4/af_inet.c aio-v2.4.9-ac14.diff/net/ipv4/af_inet.c --- /md0/kernels/2.4/v2.4.9-ac14/net/ipv4/af_inet.c Mon Aug 13 15:12:09 2001 +++ aio-v2.4.9-ac14.diff/net/ipv4/af_inet.c Mon Sep 24 19:09:13 2001 @@ -732,6 +732,13 @@ } +int inet_begin_read(struct socket *sock, struct kioctx *ctx, + struct iocb iocb, struct iocb *iocbptr) +{ + struct sock *sk = sock->sk; + + return sk->prot->begin_read(sk, ctx, iocb, iocbptr); +} int inet_recvmsg(struct socket *sock, struct msghdr *msg, int size, int flags, struct scm_cookie *scm) @@ -973,6 +980,7 @@ recvmsg: inet_recvmsg, mmap: sock_no_mmap, sendpage: sock_no_sendpage, + begin_read: inet_begin_read, }; struct net_proto_family inet_family_ops = { diff -urN /md0/kernels/2.4/v2.4.9-ac14/net/ipv4/udp.c aio-v2.4.9-ac14.diff/net/ipv4/udp.c --- /md0/kernels/2.4/v2.4.9-ac14/net/ipv4/udp.c Mon Sep 24 02:14:16 2001 +++ aio-v2.4.9-ac14.diff/net/ipv4/udp.c Mon Sep 24 19:09:13 2001 @@ -619,6 +619,15 @@ __udp_checksum_complete(skb); } + +static int udp_begin_read(struct sock *sk, struct kioctx *ctx, + struct iocb iocb, struct iocb *iocbptr) +{ + struct sk_buff *skb; + printk("udp_begin_read\n"); +} + + /* * This should be easy, if there is something there we * return it, otherwise we block. 
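
For readers trying to follow the flow, here is a rough sketch of how a submitter is expected to drive the new in-kernel async interface: pin the user buffer with map_user_kvec(), attach a completion callback through a kvec_cb_t, and hand the request to the file's kvec_read operation (generic_file_kvec_read for page-cache files). This sketch is not part of the patch; the my_request/my_read_done/my_submit_read names are invented for illustration, and the assumption that the submitter unmaps and kfree()s the kvec once the callback has run (mirroring the error path of map_user_kvec above) is the editor's, not the author's.

#include <linux/fs.h>
#include <linux/kiovec.h>
#include <linux/slab.h>

struct my_request {
	struct file *file;	/* whatever the caller needs to complete the request */
};

/* Completion callback: res is the number of bytes transferred or a
 * negative errno, per the rw_kiovec/kvec_cb convention. */
static void my_read_done(void *data, struct kvec *vec, ssize_t res)
{
	struct my_request *req = data;

	unmap_kvec(vec);	/* drop the page references taken by map_user_kvec() */
	kfree(vec);		/* assumption: the submitter owns the kvec */
	/* ... report res back to whoever issued req ... */
}

static int my_submit_read(struct my_request *req, struct file *file,
			  void *ubuf, size_t len, loff_t pos)
{
	struct kvec *vec;
	kvec_cb_t cb;
	int err;

	/* READ: the user pages will be written to, so VM_WRITE is required */
	vec = map_user_kvec(READ, (unsigned long)ubuf, len);
	if (IS_ERR(vec))
		return PTR_ERR(vec);

	cb.vec = vec;
	cb.fn = my_read_done;
	cb.data = req;

	err = file->f_op->kvec_read(file, cb, len, pos);
	if (err) {
		unmap_kvec(vec);
		kfree(vec);
	}
	return err;
}

The same pattern applies to kvec_write; the begin_read hooks being added to the socket layer here are the analogous, still skeletal, entry points on the network side.
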
@@ -1016,6 +1025,7 @@ getsockopt: ip_getsockopt, sendmsg: udp_sendmsg, recvmsg: udp_recvmsg, + begin_read: udp_begin_read, backlog_rcv: udp_queue_rcv_skb, hash: udp_v4_hash, unhash: udp_v4_unhash, diff -urN /md0/kernels/2.4/v2.4.9-ac14/net/socket.c aio-v2.4.9-ac14.diff/net/socket.c --- /md0/kernels/2.4/v2.4.9-ac14/net/socket.c Mon Sep 24 02:14:17 2001 +++ aio-v2.4.9-ac14.diff/net/socket.c Mon Sep 24 19:09:13 2001 @@ -107,6 +107,8 @@ unsigned long count, loff_t *ppos); static ssize_t sock_sendpage(struct file *file, struct page *page, int offset, size_t size, loff_t *ppos, int more); +static int sock_begin_read(struct file *file, struct kioctx *ctx, + struct iocb iocb, struct iocb *iocbptr); /* @@ -126,6 +128,7 @@ fasync: sock_fasync, readv: sock_readv, writev: sock_writev, + //begin_read: sock_begin_read, sendpage: sock_sendpage }; @@ -623,6 +626,16 @@ return sock->ops->sendpage(sock, page, offset, size, flags); } +static int sock_begin_read(struct file *file, struct kioctx *ctx, + struct iocb iocb, struct iocb *iocbptr) +{ + struct socket *sock; + sock = socki_lookup(file->f_dentry->d_inode); + if (sock->ops->begin_read) + return sock->ops->begin_read(sock, ctx, iocb, iocbptr); + return -EINVAL; +} + int sock_readv_writev(int type, struct inode * inode, struct file * file, const struct iovec * iov, long count, long size) {