/*P:100 This is the Launcher code, a simple program which lays out the
- * "physical" memory for the new Guest by mapping the kernel image and the
- * virtual devices, then reads repeatedly from /dev/lguest to run the Guest.
-:*/
+ * "physical" memory for the new Guest by mapping the kernel image and
+ * the virtual devices, then opens /dev/lguest to tell the kernel
+ * about the Guest and control it. :*/
#define _LARGEFILE64_SOURCE
#define _GNU_SOURCE
#include <stdio.h>
#include "linux/virtio_console.h"
#include "linux/virtio_ring.h"
#include "asm-x86/bootparam.h"
-/*L:110 We can ignore the 38 include files we need for this program, but I do
+/*L:110 We can ignore the 39 include files we need for this program, but I do
* want to draw attention to the use of kernel-style types.
*
* As Linus said, "C is a Spartan language, and so should your naming be." I
/* Any queues attached to this device */
struct virtqueue *vq;
+ /* Handle status being finalized (ie. feature bits stable). */
+ void (*ready)(struct device *me);
+
/* Device-specific data. */
void *priv;
};
/* The routine to call when the Guest pings us. */
void (*handle_output)(int fd, struct virtqueue *me);
+
+ /* Outstanding buffers */
+ unsigned int inflight;
};
/* Remember the arguments to the program so we can "reboot" */
err(1, "Reading program headers");
/* Try all the headers: there are usually only three. A read-only one,
- * a read-write one, and a "note" section which isn't loadable. */
+ * a read-write one, and a "note" section which we don't load. */
for (i = 0; i < ehdr->e_phnum; i++) {
/* If this isn't a loadable segment, we ignore it */
if (phdr[i].p_type != PT_LOAD)
if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0)
return map_elf(fd, &hdr);
- /* Otherwise we assume it's a bzImage, and try to unpack it */
+ /* Otherwise we assume it's a bzImage, and try to load it. */
return load_bzimage(fd);
}
return len;
}
-/* Once we know how much memory we have, we can construct simple linear page
+/* Once we know how much memory we have we can construct simple linear page
* tables which set virtual == physical which will get the Guest far enough
* into the boot to create its own.
*
* We lay them out of the way, just below the initrd (which is why we need to
- * know its size). */
+ * know its size here). */
static unsigned long setup_pagetables(unsigned long mem,
unsigned long initrd_size)
{
unsigned int i, len = 0;
for (i = 0; args[i]; i++) {
+ if (i) {
+ strcat(dst+len, " ");
+ len++;
+ }
strcpy(dst+len, args[i]);
- strcat(dst+len, " ");
- len += strlen(args[i]) + 1;
+ len += strlen(args[i]);
}
/* In case it's empty. */
dst[len] = '\0';
errx(1, "Looped descriptor");
} while ((i = next_desc(vq, i)) != vq->vring.num);
+ vq->inflight++;
return head;
}
/* Make sure buffer is written before we update index. */
wmb();
vq->vring.used->idx++;
+ vq->inflight--;
}
/* This actually sends the interrupt for this virtqueue */
{
unsigned long buf[] = { LHREQ_IRQ, vq->config.irq };
- /* If they don't want an interrupt, don't send one. */
- if (vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
+ /* If they don't want an interrupt, don't send one, unless empty. */
+ if ((vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
+ && vq->inflight)
return;
/* Send the Guest an interrupt tell them we used something up. */
*
* Handling output for network is also simple: we get all the output buffers
* and write them (ignoring the first element) to this device's file descriptor
- * (stdout). */
+ * (/dev/net/tun).
+ */
static void handle_net_output(int fd, struct virtqueue *vq)
{
unsigned int head, out, in;
write(waker_fd, &vq->dev->fd, sizeof(vq->dev->fd));
}
-/* Resetting a device is fairly easy. */
-static void reset_device(struct device *dev)
+/* When the Guest tells us they updated the status field, we handle it. */
+static void update_device_status(struct device *dev)
{
struct virtqueue *vq;
- verbose("Resetting device %s\n", dev->name);
- /* Clear the status. */
- dev->desc->status = 0;
+ /* This is a reset. */
+ if (dev->desc->status == 0) {
+ verbose("Resetting device %s\n", dev->name);
- /* Clear any features they've acked. */
- memset(get_feature_bits(dev) + dev->desc->feature_len, 0,
- dev->desc->feature_len);
+ /* Clear any features they've acked. */
+ memset(get_feature_bits(dev) + dev->desc->feature_len, 0,
+ dev->desc->feature_len);
- /* Zero out the virtqueues. */
- for (vq = dev->vq; vq; vq = vq->next) {
- memset(vq->vring.desc, 0,
- vring_size(vq->config.num, getpagesize()));
- vq->last_avail_idx = 0;
+ /* Zero out the virtqueues. */
+ for (vq = dev->vq; vq; vq = vq->next) {
+ memset(vq->vring.desc, 0,
+ vring_size(vq->config.num, getpagesize()));
+ vq->last_avail_idx = 0;
+ }
+ } else if (dev->desc->status & VIRTIO_CONFIG_S_FAILED) {
+ warnx("Device %s configuration FAILED", dev->name);
+ } else if (dev->desc->status & VIRTIO_CONFIG_S_DRIVER_OK) {
+ unsigned int i;
+
+ verbose("Device %s OK: offered", dev->name);
+ for (i = 0; i < dev->desc->feature_len; i++)
+ verbose(" %08x", get_feature_bits(dev)[i]);
+ verbose(", accepted");
+ for (i = 0; i < dev->desc->feature_len; i++)
+ verbose(" %08x", get_feature_bits(dev)
+ [dev->desc->feature_len+i]);
+
+ if (dev->ready)
+ dev->ready(dev);
}
}
/* Check each device and virtqueue. */
for (i = devices.dev; i; i = i->next) {
- /* Notifications to device descriptors reset the device. */
+ /* Notifications to device descriptors update device status. */
if (from_guest_phys(addr) == i->desc) {
- reset_device(i);
+ update_device_status(i);
return;
}
if (select(devices.max_infd+1, &fds, NULL, NULL, &poll) == 0)
break;
- /* Otherwise, call the device(s) which have readable
- * file descriptors and a method of handling them. */
+ /* Otherwise, call the device(s) which have readable file
+ * descriptors and a method of handling them. */
for (i = devices.dev; i; i = i->next) {
if (i->handle_input && FD_ISSET(i->fd, &fds)) {
int dev_fd;
* should no longer service it. Networking and
* console do this when there's no input
* buffers to deliver into. Console also uses
- * it when it discovers that stdin is
- * closed. */
+ * it when it discovers that stdin is closed. */
FD_CLR(i->fd, &devices.infds);
/* Tell waker to ignore it too, by sending a
* negative fd number (-1, since 0 is a valid
*
* All devices need a descriptor so the Guest knows it exists, and a "struct
* device" so the Launcher can keep track of it. We have common helper
- * routines to allocate and manage them. */
+ * routines to allocate and manage them.
+ */
/* The layout of the device page is a "struct lguest_device_desc" followed by a
* number of virtqueue descriptors, then two sets of feature bits, then an
struct virtqueue **i, *vq = malloc(sizeof(*vq));
void *p;
- /* First we need some pages for this virtqueue. */
+ /* First we need some memory for this virtqueue. */
pages = (vring_size(num_descs, getpagesize()) + getpagesize() - 1)
/ getpagesize();
p = get_pages(pages);
vq->next = NULL;
vq->last_avail_idx = 0;
vq->dev = dev;
+ vq->inflight = 0;
/* Initialize the configuration. */
vq->config.num = num_descs;
}
/* The first half of the feature bitmask is for us to advertise features. The
- * second half if for the Guest to accept features. */
+ * second half is for the Guest to accept features. */
static void add_feature(struct device *dev, unsigned bit)
{
u8 *features = get_feature_bits(dev);
}
/* This routine does all the creation and setup of a new device, including
- * calling new_dev_desc() to allocate the descriptor and device memory. */
+ * calling new_dev_desc() to allocate the descriptor and device memory.
+ *
+ * See what I mean about userspace being boring? */
static struct device *new_device(const char *name, u16 type, int fd,
bool (*handle_input)(int, struct device *))
{
dev->handle_input = handle_input;
dev->name = name;
dev->vq = NULL;
+ dev->ready = NULL;
/* Append to device list. Prepending to a single-linked list is
* easier, but the user expects the devices to be arranged on the bus
/* Tell Guest what MAC address to use. */
add_feature(dev, VIRTIO_NET_F_MAC);
+ add_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY);
set_config(dev, sizeof(conf), &conf);
/* We don't need the socket any more; setup is done. */
* Launcher triggers interrupt to Guest. */
int done_fd;
};
-/*:*/
/*L:210
* The Disk
struct vblk_info *vblk = dev->priv;
unsigned int head, out_num, in_num, wlen;
int ret;
- struct virtio_blk_inhdr *in;
+ u8 *in;
struct virtio_blk_outhdr *out;
struct iovec iov[dev->vq->vring.num];
off64_t off;
head, out_num, in_num);
out = convert(&iov[0], struct virtio_blk_outhdr);
- in = convert(&iov[out_num+in_num-1], struct virtio_blk_inhdr);
+ in = convert(&iov[out_num+in_num-1], u8);
off = out->sector * 512;
/* The block device implements "barriers", where the Guest indicates
* It'd be nice if we supported eject, for example, but we don't. */
if (out->type & VIRTIO_BLK_T_SCSI_CMD) {
fprintf(stderr, "Scsi commands unsupported\n");
- in->status = VIRTIO_BLK_S_UNSUPP;
+ *in = VIRTIO_BLK_S_UNSUPP;
wlen = sizeof(*in);
} else if (out->type & VIRTIO_BLK_T_OUT) {
/* Write */
errx(1, "Write past end %llu+%u", off, ret);
}
wlen = sizeof(*in);
- in->status = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR);
+ *in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR);
} else {
/* Read */
verbose("READ from sector %llu: %i\n", out->sector, ret);
if (ret >= 0) {
wlen = sizeof(*in) + ret;
- in->status = VIRTIO_BLK_S_OK;
+ *in = VIRTIO_BLK_S_OK;
} else {
wlen = sizeof(*in);
- in->status = VIRTIO_BLK_S_IOERR;
+ *in = VIRTIO_BLK_S_IOERR;
}
}
while (read(vblk->workpipe[0], &c, 1) == 1) {
/* We acknowledge each request immediately to reduce latency,
* rather than waiting until we've done them all. I haven't
- * measured to see if it makes any difference. */
+ * measured to see if it makes any difference.
+ *
+ * That would be an interesting test, wouldn't it? You could
+ * also try having more than one I/O thread. */
while (service_io(dev))
write(vblk->done_fd, &c, 1);
}
}
/* Now we've seen the I/O thread, we return to the Launcher to see what happens
- * when the thread tells us it's completed some I/O. */
+ * when that thread tells us it's completed some I/O. */
static bool handle_io_finish(int fd, struct device *dev)
{
char c;
* more work. */
pipe(vblk->workpipe);
- /* Create stack for thread and run it */
+ /* Create stack for thread and run it. Since stack grows upwards, we
+ * point the stack pointer to the end of this region. */
stack = malloc(32768);
/* SIGCHLD - We dont "wait" for our cloned thread, so prevent it from
* becoming a zombie. */
- if (clone(io_thread, stack + 32768, CLONE_VM | SIGCHLD, dev) == -1)
+ if (clone(io_thread, stack + 32768, CLONE_VM | SIGCHLD, dev) == -1)
err(1, "Creating clone");
/* We don't need to keep the I/O thread's end of the pipes open. */
verbose("device %u: virtblock %llu sectors\n",
devices.device_num, le64_to_cpu(conf.capacity));
}
-/* That's the end of device setup. :*/
+/* That's the end of device setup. */
-/* Reboot */
+/*L:230 Reboot is pretty easy: clean up and exec() the Launcher afresh. */
static void __attribute__((noreturn)) restart_guest(void)
{
unsigned int i;
- /* Closing pipes causes the waker thread and io_threads to die, and
+ /* Closing pipes causes the Waker thread and io_threads to die, and
* closing /dev/lguest cleans up the Guest. Since we don't track all
* open fds, we simply close everything beyond stderr. */
for (i = 3; i < FD_SETSIZE; i++)
err(1, "Could not exec %s", main_args[0]);
}
-/*L:220 Finally we reach the core of the Launcher, which runs the Guest, serves
+/*L:220 Finally we reach the core of the Launcher which runs the Guest, serves
* its input and output, and finally, lays it to rest. */
static void __attribute__((noreturn)) run_guest(int lguest_fd)
{
err(1, "Resetting break");
}
}
-/*
+/*L:240
* This is the end of the Launcher. The good news: we are over halfway
* through! The bad news: the most fiendish part of the code still lies ahead
* of us.
* device receive input from a file descriptor, we keep an fdset
* (infds) and the maximum fd number (max_infd) with the head of the
* list. We also keep a pointer to the last device. Finally, we keep
- * the next interrupt number to hand out (1: remember that 0 is used by
- * the timer). */
+ * the next interrupt number to use for devices (1: remember that 0 is
+ * used by the timer). */
FD_ZERO(&devices.infds);
devices.max_infd = -1;
devices.lastdev = NULL;
lguest_fd = tell_kernel(pgdir, start);
/* We fork off a child process, which wakes the Launcher whenever one
- * of the input file descriptors needs attention. Otherwise we would
- * run the Guest until it tries to output something. */
+ * of the input file descriptors needs attention. We call this the
+ * Waker, and we'll cover it in a moment. */
waker_fd = setup_waker(lguest_fd);
/* Finally, run the Guest. This doesn't return. */