# These are AIO changes needed by the ubd driver and humfs. # One major problem fixed here is -EAGAIN handling. It is not enough # to simply pass the error back to the driver so it can retry later. # The ubd driver retries in its interrupt routine, the theory being # that it knows that some requests have been finished on the host, so # there is room to queue some more. The problem is that all the # host AIO requests may be humfs requests, in which case the ubd # interrupt handler will never be called, since it had nothing # pending. # # The solution is to centralize the kernel side of AIO request # completion. Rather than have the aio thread send finished requests # directly to the driver which submitted them, they now go to a new # AIO IRQ handler, which sends finished requests to the appropriate # driver. When the host says -EAGAIN, the driver registers a restart # handler with the AIO subsystem. When a host AIO request finishes, # the AIO IRQ handler calls all registered restart handlers. So, a # driver will be notified when new requests can be queued, even if # it's not the one which clogged up the host queue in the first place. # # This required a bunch of changes in the ubd driver and humfs. The # interrupt routines are drastically different, as they are no longer # directly called from the IRQ system. They take a list of completed # requests and finish them off. They register a structure at boot # time with the AIO subsystem. This is used as the start of the list # of completed requests. When there are requests mixed together from # different drivers, they need to be separated into different lists, # and the aio_driver is used for this. # # aio_thread_reply is gone, as the err field was redundant. Once it's # gone, the aio_context is all that's left, so we might as well just # write aio_contexts between the kernel and aio thread. # # UBD_IRQ and HUMFS_IRQ are no more, being replaced by AIO_IRQ. # These are AIO changes needed by the ubd driver and humfs. 
# One major problem fixed here is -EAGAIN handling. It is not enough # to simply pass the error back to the driver so it can retry later. # The ubd driver retries in its interrupt routine, the theory being # that it knows that some requests have been finished on the host, so # there is room to queue some more. The problem is that all the # host AIO requests may be humfs requests, in which case the ubd # interrupt handler will never be called, since it had nothing # pending. # # The solution is to centralize the kernel side of AIO request # completion. Rather than have the aio thread send finished requests # directly to the driver which submitted them, they now go to a new # AIO IRQ handler, which sends finished requests to the appropriate # driver. When the host says -EAGAIN, the driver registers a restart # handler with the AIO subsystem. When a host AIO request finishes, # the AIO IRQ handler calls all registered restart handlers. So, a # driver will be notified when new requests can be queued, even if # it's not the one which clogged up the host queue in the first place. # # This required a bunch of changes in the ubd driver and humfs. The # interrupt routines are drastically different, as they are no longer # directly called from the IRQ system. They take a list of completed # requests and finish them off. They register a structure at boot # time with the AIO subsystem. This is used as the start of the list # of completed requests. When there are requests mixed together from # different drivers, they need to be separated into different lists, # and the aio_driver is used for this. # # aio_thread_reply is gone, as the err field was redundant. Once it's # gone, the aio_context is all that's left, so we might as well just # write aio_contexts between the kernel and aio thread. # # UBD_IRQ and HUMFS_IRQ are no more, being replaced by AIO_IRQ. 
Index: linux-2.6.17/arch/um/include/aio.h =================================================================== --- linux-2.6.17.orig/arch/um/include/aio.h 2007-11-19 11:58:13.000000000 -0500 +++ linux-2.6.17/arch/um/include/aio.h 2008-01-07 12:51:48.000000000 -0500 @@ -1,5 +1,5 @@ /* - * Copyright (C) 2004 Jeff Dike (jdike@karaya.com) + * Copyright (C) 2004 - 2006 Jeff Dike (jdike@addtoit.com) * Licensed under the GPL */ @@ -8,21 +8,29 @@ enum aio_type { AIO_READ, AIO_WRITE, AIO_MMAP }; -struct aio_thread_reply { - void *data; - int err; -}; - struct aio_context { - int reply_fd; + enum aio_type type; + int fd; + void *data; + int len; + unsigned long long offset; + void *driver; struct aio_context *next; }; -#define INIT_AIO_CONTEXT { .reply_fd = -1, \ - .next = NULL } - -extern int submit_aio(enum aio_type type, int fd, char *buf, int len, - unsigned long long offset, int reply_fd, - struct aio_context *aio); +#define INIT_AIO(aio_type, aio_fd, aio_data, aio_len, aio_offset, aio_driver) \ + { .type = aio_type, \ + .fd = aio_fd, \ + .data = aio_data, \ + .len = aio_len, \ + .offset = aio_offset, \ + .driver = aio_driver, \ + .next = NULL } + +extern int submit_aio(struct aio_context *aio); + +/* Declared here instead of aio-restart.h because that's a kernel header */ +extern void aio_do_restarts(void); +extern int init_aio_irq(void); #endif Index: linux-2.6.17/arch/um/os-Linux/aio.c =================================================================== --- linux-2.6.17.orig/arch/um/os-Linux/aio.c 2007-11-19 11:58:13.000000000 -0500 +++ linux-2.6.17/arch/um/os-Linux/aio.c 2008-01-07 12:51:48.000000000 -0500 @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -15,14 +16,27 @@ #include "os.h" #include "user.h" -struct aio_thread_req { - enum aio_type type; - int io_fd; - unsigned long long offset; - char *buf; - int len; - struct aio_context *aio; -}; +static int update_aio(struct aio_context *aio, int res) +{ + if(res < 0) + aio->len = 
res; + else if((res == 0) && (aio->type == AIO_READ)){ + /* This is the EOF case - we have hit the end of the file + * and it ends in a partial block, so we fill the end of + * the block with zeros and claim success. + */ + memset(aio->data, 0, aio->len); + aio->len = 0; + } + else if(res > 0){ + aio->len -= res; + aio->data += res; + aio->offset += res; + return aio->len; + } + + return 0; +} #if defined(HAVE_AIO_ABI) #include @@ -64,19 +78,18 @@ static long io_getevents(aio_context_t c * that it now backs the mmapped area. */ -static int do_aio(aio_context_t ctx, enum aio_type type, int fd, char *buf, - int len, unsigned long long offset, struct aio_context *aio) +static int do_aio(aio_context_t ctx, struct aio_context *aio) { struct iocb *iocbp = & ((struct iocb) { .aio_data = (unsigned long) aio, - .aio_fildes = fd, - .aio_buf = (unsigned long) buf, - .aio_nbytes = len, - .aio_offset = offset + .aio_fildes = aio->fd, + .aio_buf = (unsigned long) aio->data, + .aio_nbytes = aio->len, + .aio_offset = aio->offset, }); char c; - switch (type) { + switch (aio->type) { case AIO_READ: iocbp->aio_lio_opcode = IOCB_CMD_PREAD; break; @@ -89,7 +102,7 @@ static int do_aio(aio_context_t ctx, enu iocbp->aio_nbytes = sizeof(c); break; default: - printk(UM_KERN_ERR "Bogus op in do_aio - %d\n", type); + printk(UM_KERN_ERR "Bogus op in do_aio - %d\n", aio->type); return -EINVAL; } @@ -99,11 +112,13 @@ static int do_aio(aio_context_t ctx, enu /* Initialized in an initcall and unchanged thereafter */ static aio_context_t ctx = 0; +static int aio_reply_w = -1; + static int aio_thread(void *arg) { - struct aio_thread_reply reply; + struct aio_context *aio; struct io_event event; - int err, n, reply_fd; + int err, n; signal(SIGWINCH, SIG_IGN); @@ -116,14 +131,14 @@ static int aio_thread(void *arg) "errno = %d\n", errno); } else { - reply = ((struct aio_thread_reply) - { .data = (void *) (long) event.data, - .err = event.res }); - reply_fd = ((struct aio_context *) 
reply.data)->reply_fd; - err = write(reply_fd, &reply, sizeof(reply)); - if (err != sizeof(reply)) - printk(UM_KERN_ERR "aio_thread - write failed, " - "fd = %d, err = %d\n", reply_fd, errno); + /* This is safe as we've just a pointer here. */ + aio = (struct aio_context *) (long) event.data; + update_aio(aio, event.res); + err = write(aio_reply_w, &aio, sizeof(aio)); + if (err != sizeof(aio)) + printk("aio_thread - write failed, " + "fd = %d, err = %d\n", aio_reply_w, + errno); } } return 0; @@ -131,29 +146,29 @@ static int aio_thread(void *arg) #endif -static int do_not_aio(struct aio_thread_req *req) +static int do_not_aio(struct aio_context *aio) { char c; unsigned long long actual; int n; - actual = lseek64(req->io_fd, req->offset, SEEK_SET); - if (actual != req->offset) + actual = lseek64(aio->fd, aio->offset, SEEK_SET); + if (actual != aio->offset) return -errno; - switch(req->type) { + switch (aio->type) { case AIO_READ: - n = read(req->io_fd, req->buf, req->len); + n = read(aio->fd, aio->data, aio->len); break; case AIO_WRITE: - n = write(req->io_fd, req->buf, req->len); + n = write(aio->fd, aio->data, aio->len); break; case AIO_MMAP: - n = read(req->io_fd, &c, sizeof(c)); + n = read(aio->fd, &c, sizeof(c)); break; default: printk(UM_KERN_ERR "do_not_aio - bad request type : %d\n", - req->type); + aio->type); return -EINVAL; } @@ -167,17 +182,17 @@ static int aio_req_fd_r = -1; static int aio_req_fd_w = -1; static int aio_pid = -1; static unsigned long aio_stack; +static int (*submit_proc)(struct aio_context *aio); static int not_aio_thread(void *arg) { - struct aio_thread_req req; - struct aio_thread_reply reply; + struct aio_context *aio; int err; signal(SIGWINCH, SIG_IGN); while (1) { - err = read(aio_req_fd_r, &req, sizeof(req)); - if (err != sizeof(req)) { + err = read(aio_req_fd_r, &aio, sizeof(aio)); + if (err != sizeof(aio)) { if (err < 0) printk(UM_KERN_ERR "not_aio_thread - " "read failed, fd = %d, err = %d\n", @@ -190,18 +205,30 @@ static int 
not_aio_thread(void *arg) } continue; } - err = do_not_aio(&req); - reply = ((struct aio_thread_reply) { .data = req.aio, - .err = err }); - err = write(req.aio->reply_fd, &reply, sizeof(reply)); - if (err != sizeof(reply)) - printk(UM_KERN_ERR "not_aio_thread - write failed, " - "fd = %d, err = %d\n", req.aio->reply_fd, errno); + + do err = do_not_aio(aio); + while (update_aio(aio, err)); + + err = write(aio_reply_w, &aio, sizeof(aio)); + if (err != sizeof(aio)) + printk("not_aio_thread - write failed, fd = %d, " + "err = %d\n", aio_reply_w, errno); } return 0; } +static int submit_aio_24(struct aio_context *aio) +{ + int err; + + err = write(aio_req_fd_w, &aio, sizeof(aio)); + if (err == sizeof(aio)) + err = 0; + else err = -errno; /* hand the caller a negative errno, not write()'s raw result */ + return err; +} + static int init_aio_24(void) { int fds[2], err; @@ -237,11 +264,34 @@ out: #endif printk(UM_KERN_INFO "2.6 host AIO support not used - falling back to " "I/O thread\n"); + + submit_proc = submit_aio_24; + return 0; } #ifdef HAVE_AIO_ABI #define DEFAULT_24_AIO 0 +static int submit_aio_26(struct aio_context *aio) +{ + int err; + + err = do_aio(ctx, aio); + if(err == -EAGAIN) + return err; + + if(err){ + aio->len = err; + err = write(aio_reply_w, &aio, sizeof(aio)); + err = (err == sizeof(aio)) ? 0 : -errno; /* capture errno before printk can clobber it */ + if(err) + printk("submit_aio_26 - write failed, " + "fd = %d, err = %d\n", aio_reply_w, -err); + } + + return err; +} + static int init_aio_26(void) { int err; @@ -261,41 +311,21 @@ static int init_aio_26(void) aio_pid = err; printk(UM_KERN_INFO "Using 2.6 host AIO\n"); - return 0; -} - -static int submit_aio_26(enum aio_type type, int io_fd, char *buf, int len, - unsigned long long offset, struct aio_context *aio) -{ - struct aio_thread_reply reply; - int err; - - err = do_aio(ctx, type, io_fd, buf, len, offset, aio); - if (err) { - reply = ((struct aio_thread_reply) { .data = aio, - .err = err }); - err = write(aio->reply_fd, &reply, sizeof(reply)); - if (err != sizeof(reply)) { - err = -errno; - printk(UM_KERN_ERR "submit_aio_26 - write 
failed, " - "fd = %d, err = %d\n", aio->reply_fd, -err); - } - else err = 0; - } + submit_proc = submit_aio_26; - return err; + return 0; } #else #define DEFAULT_24_AIO 1 -static int init_aio_26(void) +static int submit_aio_26(struct aio_context *aio) { return -ENOSYS; } -static int submit_aio_26(enum aio_type type, int io_fd, char *buf, int len, - unsigned long long offset, struct aio_context *aio) +static int init_aio_26(void) { + submit_proc = submit_aio_26; return -ENOSYS; } #endif @@ -325,6 +355,11 @@ static int init_aio(void) { int err; + aio_reply_w = init_aio_irq(); + if(aio_reply_w < 0) + printk("Failed to initialize aio_reply_w, err = %d\n", + -aio_reply_w); + if (!aio_24) { err = init_aio_26(); if (err && (errno == ENOSYS)) { @@ -360,33 +395,7 @@ static void exit_aio(void) __uml_exitcall(exit_aio); -static int submit_aio_24(enum aio_type type, int io_fd, char *buf, int len, - unsigned long long offset, struct aio_context *aio) -{ - struct aio_thread_req req = { .type = type, - .io_fd = io_fd, - .offset = offset, - .buf = buf, - .len = len, - .aio = aio, - }; - int err; - - err = write(aio_req_fd_w, &req, sizeof(req)); - if (err == sizeof(req)) - err = 0; - else err = -errno; - - return err; -} - -int submit_aio(enum aio_type type, int io_fd, char *buf, int len, - unsigned long long offset, int reply_fd, - struct aio_context *aio) +int submit_aio(struct aio_context *aio) { - aio->reply_fd = reply_fd; - if (aio_24) - return submit_aio_24(type, io_fd, buf, len, offset, aio); - else - return submit_aio_26(type, io_fd, buf, len, offset, aio); + return (*submit_proc)(aio); } Index: linux-2.6.17/arch/um/include/aio-restart.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.17/arch/um/include/aio-restart.h 2007-11-19 16:42:39.000000000 -0500 @@ -0,0 +1,28 @@ +/* + * Copyright (C) 2006 Jeff Dike (jdike@addtoit.com) + * Licensed under the GPL + */ + +#ifndef __AIO_RESTART_H__ 
+#define __AIO_RESTART_H__ + +#include "linux/list.h" +#include "aio.h" + +struct aio_restart { + struct list_head list; + void (*restart_proc)(void *arg); + void *arg; +}; + +extern void aio_add_restart(struct aio_restart *restart); + +struct aio_driver { + struct list_head list; + void (*handler)(struct aio_context *); + struct aio_context *requests; +}; + +void register_aio_driver(struct aio_driver *driver); + +#endif Index: linux-2.6.17/arch/um/include/irq_kern.h =================================================================== --- linux-2.6.17.orig/arch/um/include/irq_kern.h 2007-11-19 11:58:13.000000000 -0500 +++ linux-2.6.17/arch/um/include/irq_kern.h 2007-11-19 16:42:39.000000000 -0500 @@ -1,5 +1,5 @@ /* - * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com) + * Copyright (C) 2001 - 2006 Jeff Dike (jdike@addtoit.com) * Licensed under the GPL */ @@ -10,21 +10,7 @@ #include "asm/ptrace.h" extern int um_request_irq(unsigned int irq, int fd, int type, - irq_handler_t handler, - unsigned long irqflags, const char * devname, - void *dev_id); -extern int init_aio_irq(int irq, char *name, - irq_handler_t handler); + irq_handler_t handler, unsigned long irqflags, + const char *devname, void *dev_id); #endif - -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. 
- * --------------------------------------------------------------------------- - * Local variables: - * c-file-style: "linux" - * End: - */ Index: linux-2.6.17/arch/um/kernel/Makefile =================================================================== --- linux-2.6.17.orig/arch/um/kernel/Makefile 2007-11-19 11:58:13.000000000 -0500 +++ linux-2.6.17/arch/um/kernel/Makefile 2008-01-07 12:52:04.000000000 -0500 @@ -6,8 +6,8 @@ extra-y := vmlinux.lds clean-files := -obj-y = config.o exec.o exitcode.o init_task.o irq.o ksyms.o mem.o \ - physmem.o process.o ptrace.o reboot.o sigio.o \ +obj-y = aio.o config.o exec.o exitcode.o init_task.o irq.o \ + ksyms.o mem.o physmem.o process.o ptrace.o reboot.o sigio.o \ signal.o smp.o syscall.o sysrq.o time.o tlb.o trap.o uaccess.o \ um_arch.o umid.o skas/ Index: linux-2.6.17/arch/um/kernel/aio.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.17/arch/um/kernel/aio.c 2007-11-19 17:16:51.000000000 -0500 @@ -0,0 +1,168 @@ +/* + * Copyright (C) 2006 Jeff Dike (jdike@addtoit.com) + * Licensed under the GPL + */ + +#include "linux/interrupt.h" +#include "linux/spinlock.h" +#include "linux/list.h" +#include "aio.h" +#include "irq_kern.h" +#include "os.h" +#include "aio-restart.h" + +static DEFINE_SPINLOCK(restart_lock); +static LIST_HEAD(restarts); + +void aio_add_restart(struct aio_restart *restart) +{ + unsigned long flags; + + spin_lock_irqsave(&restart_lock, flags); + list_add(&restart->list, &restarts); + spin_unlock_irqrestore(&restart_lock, flags); +} + +void aio_do_restarts(void) +{ + struct aio_restart *restart; + struct list_head list; + unsigned long flags; + + /* We have to be able to tolerate the restart entry being immediately + * requeued. Restarting a queue may result in only part of the pending + * requests able to be queued to the host. We copy the restarts list + * to a local and empty restarts. 
Then, it's necessary to remove each + * element from the list before processing it so that it can be added + * back to the restarts list in the callback. + */ + + INIT_LIST_HEAD(&list); + spin_lock_irqsave(&restart_lock, flags); + list_splice_init(&restarts, &list); + spin_unlock_irqrestore(&restart_lock, flags); + + while(!list_empty(&list)){ + restart = list_entry(list.next, struct aio_restart, list); + list_del(&restart->list); + (*restart->restart_proc)(restart->arg); + } +} + +static DEFINE_SPINLOCK(driver_lock); +static LIST_HEAD(aio_drivers); + +/* + * Initialized by init_aio_irq, which is called from init_aio, which + * is an initcall. + */ +static int aio_reply_r = -1; + +/* It can happen that io_getevents will return a partially finished request. + * Dunno why. These just get resubmitted. However, it can also + * happen that the submission will return -EAGAIN. To prevent the + * request from being lost, we stick it on this list and try + * submitting it the next time around, when maybe the host can deal + * with it. 
+ */ +static struct aio_context *retry_list; + +static irqreturn_t aio_handler(int irq, void *dev) +{ + struct aio_driver *driver; + struct aio_context *aio, **prev; + struct list_head *ele; + unsigned long flags; + int err; + + prev = &retry_list; + aio = retry_list; + while(aio){ + err = submit_aio(aio); + if(err) + printk("Resubmitting 0x%p returned %d\n", aio, err); + if(err != -EAGAIN) + *prev = aio->next; + else prev = &aio->next; /* only step past entries that stay on the retry list */ + aio = aio->next; + } + + while(1){ + err = os_read_file(aio_reply_r, &aio, sizeof(aio)); + if(err < 0){ + if(err != -EAGAIN) + printk("aio_handler - read returned err %d\n", + -err); + break; + } + + if(aio->len > 0){ + err = submit_aio(aio); + if(err){ + printk("aio_handler - submit_aio 0x%p returned " + "%d\n", aio, err); + if(err == -EAGAIN){ + aio->next = retry_list; + retry_list = aio; + } + } + continue; + } + + driver = aio->driver; + aio->next = driver->requests; + driver->requests = aio; + } + + spin_lock_irqsave(&driver_lock, flags); + list_for_each(ele, &aio_drivers){ + driver = list_entry(ele, struct aio_driver, list); + driver->handler(driver->requests); + driver->requests = NULL; + } + spin_unlock_irqrestore(&driver_lock, flags); + + aio_do_restarts(); + + reactivate_fd(aio_reply_r, AIO_IRQ); + + return IRQ_HANDLED; +} + +void register_aio_driver(struct aio_driver *driver) +{ + unsigned long flags; + + spin_lock_irqsave(&driver_lock, flags); + list_add(&driver->list, &aio_drivers); + spin_unlock_irqrestore(&driver_lock, flags); +} + +int init_aio_irq(void) +{ + int fds[2], err; + + err = os_pipe(fds, 1, 1); + if (err) { + printk("init_aio_irq - os_pipe failed, err = %d\n", -err); + goto out; + } + + err = um_request_irq(AIO_IRQ, fds[0], IRQ_READ, aio_handler, + IRQF_DISABLED | IRQF_SAMPLE_RANDOM, "aio", NULL); + if (err) { + printk("init_aio_irq - : um_request_irq failed, err = %d\n", + err); + goto out_close; + } + + aio_reply_r = fds[0]; + + return fds[1]; + + out_close: + os_close_file(fds[0]); + os_close_file(fds[1]); 
+ out: + return err; +} Index: linux-2.6.17/include/asm-um/irq.h =================================================================== --- linux-2.6.17.orig/include/asm-um/irq.h 2007-11-19 11:58:13.000000000 -0500 +++ linux-2.6.17/include/asm-um/irq.h 2008-01-07 12:52:00.000000000 -0500 @@ -15,8 +15,9 @@ #define SIGIO_WRITE_IRQ 11 #define TELNETD_IRQ 12 #define XTERM_IRQ 13 +#define AIO_IRQ 14 -#define LAST_IRQ XTERM_IRQ +#define LAST_IRQ AIO_IRQ #define NR_IRQS (LAST_IRQ + 1) #endif Index: linux-2.6.17/arch/um/kernel/irq.c =================================================================== --- linux-2.6.17.orig/arch/um/kernel/irq.c 2007-11-19 11:58:13.000000000 -0500 +++ linux-2.6.17/arch/um/kernel/irq.c 2008-01-07 12:51:10.000000000 -0500 @@ -405,37 +405,6 @@ void __init init_IRQ(void) } } -int init_aio_irq(int irq, char *name, irq_handler_t handler) -{ - int fds[2], err; - - err = os_pipe(fds, 1, 1); - if (err) { - printk(KERN_ERR "init_aio_irq - os_pipe failed, err = %d\n", - -err); - goto out; - } - - err = um_request_irq(irq, fds[0], IRQ_READ, handler, - IRQF_DISABLED | IRQF_SAMPLE_RANDOM, name, - (void *) (long) fds[0]); - if (err) { - printk(KERN_ERR "init_aio_irq - : um_request_irq failed, " - "err = %d\n", - err); - goto out_close; - } - - err = fds[1]; - goto out; - - out_close: - os_close_file(fds[0]); - os_close_file(fds[1]); - out: - return err; -} - /* * IRQ stack entry and exit: *