/*P:100 This is the Launcher code, a simple program which lays out the
- * "physical" memory for the new Guest by mapping the kernel image and the
- * virtual devices, then reads repeatedly from /dev/lguest to run the Guest.
-:*/
+ * "physical" memory for the new Guest by mapping the kernel image and
+ * the virtual devices, then opens /dev/lguest to tell the kernel
+ * about the Guest and control it. :*/
#define _LARGEFILE64_SOURCE
#define _GNU_SOURCE
#include <stdio.h>
#include "linux/virtio_console.h"
#include "linux/virtio_ring.h"
#include "asm-x86/bootparam.h"
-/*L:110 We can ignore the 38 include files we need for this program, but I do
+/*L:110 We can ignore the 39 include files we need for this program, but I do
* want to draw attention to the use of kernel-style types.
*
* As Linus said, "C is a Spartan language, and so should your naming be." I
/* Any queues attached to this device */
struct virtqueue *vq;
+ /* Handle status being finalized (ie. feature bits stable). */
+ void (*ready)(struct device *me);
+
/* Device-specific data. */
void *priv;
};
#define le32_to_cpu(v32) (v32)
#define le64_to_cpu(v64) (v64)
+/* The device virtqueue descriptors are followed by feature bitmasks. */
+static u8 *get_feature_bits(struct device *dev)
+{
+ return (u8 *)(dev->desc + 1)
+ + dev->desc->num_vq * sizeof(struct lguest_vqconfig);
+}
+
/*L:100 The Launcher code itself takes us out into userspace, that scary place
* where pointers run wild and free! Unfortunately, like most userspace
* programs, it's quite boring (which is why everyone likes to hack on the
err(1, "Reading program headers");
/* Try all the headers: there are usually only three. A read-only one,
- * a read-write one, and a "note" section which isn't loadable. */
+ * a read-write one, and a "note" section which we don't load. */
for (i = 0; i < ehdr->e_phnum; i++) {
/* If this isn't a loadable segment, we ignore it */
if (phdr[i].p_type != PT_LOAD)
if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0)
return map_elf(fd, &hdr);
- /* Otherwise we assume it's a bzImage, and try to unpack it */
+ /* Otherwise we assume it's a bzImage, and try to load it. */
return load_bzimage(fd);
}
return len;
}
-/* Once we know how much memory we have, we can construct simple linear page
+/* Once we know how much memory we have we can construct simple linear page
* tables which set virtual == physical which will get the Guest far enough
* into the boot to create its own.
*
* We lay them out of the way, just below the initrd (which is why we need to
- * know its size). */
+ * know its size here). */
static unsigned long setup_pagetables(unsigned long mem,
unsigned long initrd_size)
{
unsigned int i, len = 0;
for (i = 0; args[i]; i++) {
+ if (i) {
+ strcat(dst+len, " ");
+ len++;
+ }
strcpy(dst+len, args[i]);
- strcat(dst+len, " ");
- len += strlen(args[i]) + 1;
+ len += strlen(args[i]);
}
/* In case it's empty. */
dst[len] = '\0';
*
* Handling output for network is also simple: we get all the output buffers
* and write them (ignoring the first element) to this device's file descriptor
- * (stdout). */
+ * (/dev/net/tun).
+ */
static void handle_net_output(int fd, struct virtqueue *vq)
{
unsigned int head, out, in;
write(waker_fd, &vq->dev->fd, sizeof(vq->dev->fd));
}
+/* When the Guest tells us they updated the status field, we handle it. */
+static void update_device_status(struct device *dev)
+{
+ struct virtqueue *vq;
+
+ /* This is a reset. */
+ if (dev->desc->status == 0) {
+ verbose("Resetting device %s\n", dev->name);
+
+ /* Clear any features they've acked. */
+ memset(get_feature_bits(dev) + dev->desc->feature_len, 0,
+ dev->desc->feature_len);
+
+ /* Zero out the virtqueues. */
+ for (vq = dev->vq; vq; vq = vq->next) {
+ memset(vq->vring.desc, 0,
+ vring_size(vq->config.num, getpagesize()));
+ vq->last_avail_idx = 0;
+ }
+ } else if (dev->desc->status & VIRTIO_CONFIG_S_FAILED) {
+ warnx("Device %s configuration FAILED", dev->name);
+ } else if (dev->desc->status & VIRTIO_CONFIG_S_DRIVER_OK) {
+ unsigned int i;
+
+ verbose("Device %s OK: offered", dev->name);
+ for (i = 0; i < dev->desc->feature_len; i++)
+ verbose(" %08x", get_feature_bits(dev)[i]);
+ verbose(", accepted");
+ for (i = 0; i < dev->desc->feature_len; i++)
+ verbose(" %08x", get_feature_bits(dev)
+ [dev->desc->feature_len+i]);
+
+ if (dev->ready)
+ dev->ready(dev);
+ }
+}
+
/* This is the generic routine we call when the Guest uses LHCALL_NOTIFY. */
static void handle_output(int fd, unsigned long addr)
{
struct device *i;
struct virtqueue *vq;
- /* Check each virtqueue. */
+ /* Check each device and virtqueue. */
for (i = devices.dev; i; i = i->next) {
+ /* Notifications to device descriptors update device status. */
+ if (from_guest_phys(addr) == i->desc) {
+ update_device_status(i);
+ return;
+ }
+
+ /* Notifications to virtqueues mean output has occurred. */
for (vq = i->vq; vq; vq = vq->next) {
- if (vq->config.pfn == addr/getpagesize()
- && vq->handle_output) {
- verbose("Output to %s\n", vq->dev->name);
- vq->handle_output(fd, vq);
+ if (vq->config.pfn != addr/getpagesize())
+ continue;
+
+ /* Guest should acknowledge (and set features!) before
+ * using the device. */
+ if (i->desc->status == 0) {
+ warnx("%s gave early output", i->name);
return;
}
+
+ if (strcmp(vq->dev->name, "console") != 0)
+ verbose("Output to %s\n", vq->dev->name);
+ if (vq->handle_output)
+ vq->handle_output(fd, vq);
+ return;
}
}
if (select(devices.max_infd+1, &fds, NULL, NULL, &poll) == 0)
break;
- /* Otherwise, call the device(s) which have readable
- * file descriptors and a method of handling them. */
+ /* Otherwise, call the device(s) which have readable file
+ * descriptors and a method of handling them. */
for (i = devices.dev; i; i = i->next) {
if (i->handle_input && FD_ISSET(i->fd, &fds)) {
int dev_fd;
* should no longer service it. Networking and
* console do this when there's no input
* buffers to deliver into. Console also uses
- * it when it discovers that stdin is
- * closed. */
+ * it when it discovers that stdin is closed. */
FD_CLR(i->fd, &devices.infds);
/* Tell waker to ignore it too, by sending a
* negative fd number (-1, since 0 is a valid
*
* All devices need a descriptor so the Guest knows it exists, and a "struct
* device" so the Launcher can keep track of it. We have common helper
- * routines to allocate and manage them. */
+ * routines to allocate and manage them.
+ */
/* The layout of the device page is a "struct lguest_device_desc" followed by a
* number of virtqueue descriptors, then two sets of feature bits, then an
struct virtqueue **i, *vq = malloc(sizeof(*vq));
void *p;
- /* First we need some pages for this virtqueue. */
+ /* First we need some memory for this virtqueue. */
pages = (vring_size(num_descs, getpagesize()) + getpagesize() - 1)
/ getpagesize();
p = get_pages(pages);
* virtqueue. */
vq->handle_output = handle_output;
- /* Set the "Don't Notify Me" flag if we don't have a handler */
+ /* As an optimization, set the advisory "Don't Notify Me" flag if we
+ * don't have a handler */
if (!handle_output)
vq->vring.used->flags = VRING_USED_F_NO_NOTIFY;
}
-/* The virtqueue descriptors are followed by feature bytes. */
+/* The first half of the feature bitmask is for us to advertise features. The
+ * second half is for the Guest to accept features. */
static void add_feature(struct device *dev, unsigned bit)
{
- u8 *features;
+ u8 *features = get_feature_bits(dev);
/* We can't extend the feature bits once we've added config bytes */
if (dev->desc->feature_len <= bit / CHAR_BIT) {
dev->desc->feature_len = (bit / CHAR_BIT) + 1;
}
- features = (u8 *)(dev->desc + 1)
- + dev->desc->num_vq * sizeof(struct lguest_vqconfig);
-
features[bit / CHAR_BIT] |= (1 << (bit % CHAR_BIT));
}
}
/* This routine does all the creation and setup of a new device, including
- * calling new_dev_desc() to allocate the descriptor and device memory. */
+ * calling new_dev_desc() to allocate the descriptor and device memory.
+ *
+ * See what I mean about userspace being boring? */
static struct device *new_device(const char *name, u16 type, int fd,
bool (*handle_input)(int, struct device *))
{
dev->handle_input = handle_input;
dev->name = name;
dev->vq = NULL;
+ dev->ready = NULL;
/* Append to device list. Prepending to a single-linked list is
* easier, but the user expects the devices to be arranged on the bus
* Launcher triggers interrupt to Guest. */
int done_fd;
};
-/*:*/
/*L:210
* The Disk
struct vblk_info *vblk = dev->priv;
unsigned int head, out_num, in_num, wlen;
int ret;
- struct virtio_blk_inhdr *in;
+ u8 *in;
struct virtio_blk_outhdr *out;
struct iovec iov[dev->vq->vring.num];
off64_t off;
head, out_num, in_num);
out = convert(&iov[0], struct virtio_blk_outhdr);
- in = convert(&iov[out_num+in_num-1], struct virtio_blk_inhdr);
+ in = convert(&iov[out_num+in_num-1], u8);
off = out->sector * 512;
/* The block device implements "barriers", where the Guest indicates
* It'd be nice if we supported eject, for example, but we don't. */
if (out->type & VIRTIO_BLK_T_SCSI_CMD) {
fprintf(stderr, "Scsi commands unsupported\n");
- in->status = VIRTIO_BLK_S_UNSUPP;
+ *in = VIRTIO_BLK_S_UNSUPP;
wlen = sizeof(*in);
} else if (out->type & VIRTIO_BLK_T_OUT) {
/* Write */
errx(1, "Write past end %llu+%u", off, ret);
}
wlen = sizeof(*in);
- in->status = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR);
+ *in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR);
} else {
/* Read */
verbose("READ from sector %llu: %i\n", out->sector, ret);
if (ret >= 0) {
wlen = sizeof(*in) + ret;
- in->status = VIRTIO_BLK_S_OK;
+ *in = VIRTIO_BLK_S_OK;
} else {
wlen = sizeof(*in);
- in->status = VIRTIO_BLK_S_IOERR;
+ *in = VIRTIO_BLK_S_IOERR;
}
}
while (read(vblk->workpipe[0], &c, 1) == 1) {
/* We acknowledge each request immediately to reduce latency,
* rather than waiting until we've done them all. I haven't
- * measured to see if it makes any difference. */
+ * measured to see if it makes any difference.
+ *
+ * That would be an interesting test, wouldn't it? You could
+ * also try having more than one I/O thread. */
while (service_io(dev))
write(vblk->done_fd, &c, 1);
}
}
/* Now we've seen the I/O thread, we return to the Launcher to see what happens
- * when the thread tells us it's completed some I/O. */
+ * when that thread tells us it's completed some I/O. */
static bool handle_io_finish(int fd, struct device *dev)
{
char c;
* more work. */
pipe(vblk->workpipe);
- /* Create stack for thread and run it */
+ /* Create stack for thread and run it. Since stack grows upwards, we
+ * point the stack pointer to the end of this region. */
stack = malloc(32768);
/* SIGCHLD - We dont "wait" for our cloned thread, so prevent it from
* becoming a zombie. */
- if (clone(io_thread, stack + 32768, CLONE_VM | SIGCHLD, dev) == -1)
+ if (clone(io_thread, stack + 32768, CLONE_VM | SIGCHLD, dev) == -1)
err(1, "Creating clone");
/* We don't need to keep the I/O thread's end of the pipes open. */
verbose("device %u: virtblock %llu sectors\n",
devices.device_num, le64_to_cpu(conf.capacity));
}
-/* That's the end of device setup. :*/
+/* That's the end of device setup. */
-/* Reboot */
+/*L:230 Reboot is pretty easy: clean up and exec() the Launcher afresh. */
static void __attribute__((noreturn)) restart_guest(void)
{
unsigned int i;
- /* Closing pipes causes the waker thread and io_threads to die, and
+ /* Closing pipes causes the Waker thread and io_threads to die, and
* closing /dev/lguest cleans up the Guest. Since we don't track all
* open fds, we simply close everything beyond stderr. */
for (i = 3; i < FD_SETSIZE; i++)
err(1, "Could not exec %s", main_args[0]);
}
-/*L:220 Finally we reach the core of the Launcher, which runs the Guest, serves
+/*L:220 Finally we reach the core of the Launcher which runs the Guest, serves
* its input and output, and finally, lays it to rest. */
static void __attribute__((noreturn)) run_guest(int lguest_fd)
{
err(1, "Resetting break");
}
}
-/*
+/*L:240
* This is the end of the Launcher. The good news: we are over halfway
* through! The bad news: the most fiendish part of the code still lies ahead
* of us.
* device receive input from a file descriptor, we keep an fdset
* (infds) and the maximum fd number (max_infd) with the head of the
* list. We also keep a pointer to the last device. Finally, we keep
- * the next interrupt number to hand out (1: remember that 0 is used by
- * the timer). */
+ * the next interrupt number to use for devices (1: remember that 0 is
+ * used by the timer). */
FD_ZERO(&devices.infds);
devices.max_infd = -1;
devices.lastdev = NULL;
lguest_fd = tell_kernel(pgdir, start);
/* We fork off a child process, which wakes the Launcher whenever one
- * of the input file descriptors needs attention. Otherwise we would
- * run the Guest until it tries to output something. */
+ * of the input file descriptors needs attention. We call this the
+ * Waker, and we'll cover it in a moment. */
waker_fd = setup_waker(lguest_fd);
/* Finally, run the Guest. This doesn't return. */