
List:       openbsd-tech
Subject:    vmd(8) vionet: adapt zero-copy approach from vioblk
From:       Dave Voutila <dv@sisu.io>
Date:       2023-10-29 14:04:44
Message-ID: 87o7gh30z1.fsf@mars.sisu.io

Recently, I rewrote parts of the emulated virtio block device in vmd(8)
to fix issues found bringing up NetBSD as a guest. In the process, the
rewrite made it possible to do zero-copy transfers for raw block
devices. The diff below brings that design to the virtio network device.
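
For anyone who didn't follow the vioblk work, the core of the zero-copy
idea is small: instead of read(2)ing into a bounce buffer and copying
into each guest buffer afterwards, the descriptor chain is translated
into an iovec array up front and handed to readv(2)/writev(2), so the
kernel moves the data directly to or from guest memory, and the read is
only attempted once the rx ring actually has room. A rough, standalone
sketch of the rx direction (chain_to_iov() is a hypothetical stand-in
for the gpa-to-hva translation the device does while walking the
vring):

#include <sys/types.h>
#include <sys/uio.h>	/* readv(2) */
#include <errno.h>

#define QUEUE_SIZE	256

/*
 * Hypothetical helper: walk one rx descriptor chain and fill iov[] with
 * host-virtual pointers into guest memory. Returns the element count,
 * 0 if the ring is empty, or -1 on a bad chain.
 */
int	chain_to_iov(void *vring, struct iovec *iov, int max);

/*
 * Pull one packet straight into the guest buffers described by the
 * chain. Returns bytes received, 0 if nothing to do, -1 on error.
 */
static ssize_t
rx_zerocopy(int tapfd, void *vring)
{
	struct iovec iov[QUEUE_SIZE];
	int iov_cnt;
	ssize_t sz;

	iov_cnt = chain_to_iov(vring, iov, QUEUE_SIZE);
	if (iov_cnt <= 0)
		return (iov_cnt == 0 ? 0 : -1);

	/* The tap fd is non-blocking: EAGAIN just means no packet yet. */
	sz = readv(tapfd, iov, iov_cnt);
	if (sz == -1 && errno == EAGAIN)
		return (0);
	return (sz);
}

The tx direction is the same trick with writev(2); the copy path only
kicks in when the lladdr needs enforcing or the packet came from the
injection pipe described below.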

In summary:

  - flips the logic to read from the tap(4) only when there's space in
    the rx virtqueue, instead of whenever the tap is readable

  - uses readv/writev when it doesn't need to enforce a locked lladdr
    or inspect packets on "local" interfaces for DHCP intercept

  - redesigns packet injection for DHCP replies by using a pipe,
    allowing them to be handled by the rx event path (sketched below)

  - puts the device into "needs reset" state under error conditions so
    the driver will know something is borked

  - drops the include of sys/param.h as it switches the static rx buffer
    to be based on maximum tx size and not PAGE_SIZE
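
Since the injection redesign is probably the least obvious bit, here's
the shape of it: instead of enqueueing a DHCP reply onto the rx ring
from the tx path, the tx path writes a small { buf, len } record into a
non-blocking pipe, and the pipe's read end is wired into the same event
handler as the tap fd, so the reply flows through the normal rx copy
path. A minimal sketch of that hand-off, with illustrative names rather
than the exact vmd(8) ones:

#include <sys/types.h>
#include <errno.h>
#include <fcntl.h>	/* O_NONBLOCK */
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>	/* pipe2(2) */

/*
 * Both ends of the pipe live in the same process, so passing a heap
 * pointer through it is safe.
 */
struct packet {
	uint8_t	*buf;
	size_t	 len;
};

static int pipe_inject[2];	/* [0] read end, [1] write end */

static int
inject_init(void)
{
	return (pipe2(pipe_inject, O_NONBLOCK));
}

/*
 * tx side: hand a heap-allocated reply to the rx event path. Ownership
 * of buf passes to the reader; if the write fails, drop the reply.
 */
static void
inject_packet(uint8_t *buf, size_t len)
{
	struct packet pkt = { buf, len };

	if (write(pipe_inject[1], &pkt, sizeof(pkt)) != (ssize_t)sizeof(pkt))
		free(buf);
}

/*
 * rx side: called when the pipe's read end is readable. Copies the
 * payload into dst and frees the record's buffer.
 */
static ssize_t
inject_recv(uint8_t *dst, size_t dstlen)
{
	struct packet pkt;
	ssize_t n;
	size_t sz;

	n = read(pipe_inject[0], &pkt, sizeof(pkt));
	if (n != (ssize_t)sizeof(pkt))
		return (n == -1 && errno == EAGAIN ? 0 : -1);

	sz = pkt.len < dstlen ? pkt.len : dstlen;
	memcpy(dst, pkt.buf, sz);
	free(pkt.buf);
	return ((ssize_t)sz);
}

Because the record is far smaller than PIPE_BUF, each write is atomic,
so the reader always sees whole records; the only failure modes are a
full pipe (the reply is dropped) or a real error.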

Don't expect throughput increases, but you may notice improved tail
latency and decreased cpu utilization under networking load, especially
if you're not using vmd(8)'s "local" interfaces.

ok's, feedback?

-dv

diffstat refs/heads/master refs/heads/vmd-vionet-rewrite
 M  usr.sbin/vmd/vionet.c  |  445+  301-
 M  usr.sbin/vmd/virtio.h  |    0+    7-

2 files changed, 445 insertions(+), 308 deletions(-)

diff refs/heads/master refs/heads/vmd-vionet-rewrite
commit - dae64e9e225d0c4320fe266e6b924a1630222416
commit + 7743ecfaef9332d7e7df5932ff472c6a00225414
blob - 740296a125e15f905e370e84ecf07163c58e2c99
blob + e08383130c5f19a25dcc3620cb31f4e5bab09b0d
--- usr.sbin/vmd/vionet.c
+++ usr.sbin/vmd/vionet.c
@@ -16,9 +16,8 @@
  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  */
-#include <sys/mman.h>
-#include <sys/param.h> /* PAGE_SIZE */
 #include <sys/socket.h>
+#include <sys/types.h>

 #include <dev/pci/virtio_pcireg.h>
 #include <dev/pv/virtioreg.h>
@@ -26,7 +25,6 @@
 #include <net/if.h>
 #include <netinet/in.h>
 #include <netinet/if_ether.h>
-#include <netinet/ip.h>

 #include <errno.h>
 #include <event.h>
@@ -36,7 +34,6 @@
 #include <unistd.h>

 #include "atomicio.h"
-#include "pci.h"
 #include "virtio.h"
 #include "vmd.h"

@@ -47,20 +44,36 @@
 extern char *__progname;
 extern struct vmd_vm *current_vm;

-/* Device Globals */
-struct event ev_tap;
+struct packet {
+	uint8_t	*buf;
+	size_t	 len;
+};

-static int vionet_rx(struct vionet_dev *);
+static int vionet_rx(struct vionet_dev *, int);
+static ssize_t vionet_rx_copy(struct vionet_dev *, int, const struct iovec *,
+    int, size_t);
+static ssize_t vionet_rx_zerocopy(struct vionet_dev *, int,
+    const struct iovec *, int);
 static void vionet_rx_event(int, short, void *);
 static uint32_t handle_io_read(struct viodev_msg *, struct virtio_dev *,
     int8_t *);
 static int handle_io_write(struct viodev_msg *, struct virtio_dev *);
-void vionet_notify_rx(struct virtio_dev *);
-int vionet_notifyq(struct virtio_dev *);
+static int vionet_notify_tx(struct virtio_dev *);
+static int vionet_notifyq(struct virtio_dev *);

 static void dev_dispatch_vm(int, short, void *);
 static void handle_sync_io(int, short, void *);

+/* Device Globals */
+struct event ev_tap;
+struct event ev_inject;
+int pipe_inject[2];
+#define READ	0
+#define WRITE	1
+struct iovec iov_rx[VIONET_QUEUE_SIZE];
+struct iovec iov_tx[VIONET_QUEUE_SIZE];
+
+
 __dead void
 vionet_main(int fd, int fd_vmm)
 {
@@ -79,6 +92,10 @@ vionet_main(int fd, int fd_vmm)
 	if (pledge("stdio vmm proc", NULL) == -1)
 		fatal("pledge");

+	/* Initialize iovec arrays. */
+	memset(iov_rx, 0, sizeof(iov_rx));
+	memset(iov_tx, 0, sizeof(iov_tx));
+
 	/* Receive our vionet_dev, mostly preconfigured. */
 	sz = atomicio(read, fd, &dev, sizeof(dev));
 	if (sz != sizeof(dev)) {
@@ -153,6 +170,12 @@ vionet_main(int fd, int fd_vmm)
 		}
 	}

+	/* Initialize our packet injection pipe. */
+	if (pipe2(pipe_inject, O_NONBLOCK) == -1) {
+		log_warn("%s: injection pipe", __func__);
+		goto fail;
+	}
+
 	/* Initialize libevent so we can start wiring event handlers. */
 	event_init();

@@ -171,6 +194,12 @@ vionet_main(int fd, int fd_vmm)
 	event_set(&ev_tap, vionet->data_fd, EV_READ | EV_PERSIST,
 	    vionet_rx_event, &dev);

+	/* Add an event for injected packets. */
+	log_debug("%s: wiring in packet injection handler (fd=%d)", __func__,
+	    pipe_inject[READ]);
+	event_set(&ev_inject, pipe_inject[READ], EV_READ | EV_PERSIST,
+	    vionet_rx_event, &dev);
+
 	/* Configure our sync channel event handler. */
 	log_debug("%s: wiring in sync channel handler (fd=%d)", __func__,
 		dev.sync_fd);
@@ -209,6 +238,8 @@ vionet_main(int fd, int fd_vmm)
 		close_fd(dev.sync_fd);
 		close_fd(dev.async_fd);
 		close_fd(vionet->data_fd);
+		close_fd(pipe_inject[READ]);
+		close_fd(pipe_inject[WRITE]);
 		_exit(ret);
 		/* NOTREACHED */
 	}
@@ -223,6 +254,8 @@ fail:

 	close_fd(dev.sync_fd);
 	close_fd(dev.async_fd);
+	close_fd(pipe_inject[READ]);
+	close_fd(pipe_inject[WRITE]);
 	if (vionet != NULL)
 		close_fd(vionet->data_fd);

@@ -232,7 +265,7 @@ fail:
 /*
  * Update the gpa and hva of the virtqueue.
  */
-void
+static void
 vionet_update_qa(struct vionet_dev *dev)
 {
 	struct virtio_vq_info *vq_info;
@@ -259,7 +292,7 @@ vionet_update_qa(struct vionet_dev *dev)
 /*
  * Update the queue size.
  */
-void
+static void
 vionet_update_qs(struct vionet_dev *dev)
 {
 	struct virtio_vq_info *vq_info;
@@ -280,34 +313,28 @@ vionet_update_qs(struct vionet_dev *dev)
 }

 /*
- * vionet_enq_rx
+ * vionet_rx
  *
- * Take a given packet from the host-side tap and copy it into the guest's
- * buffers utilizing the rx virtio ring. If the packet length is invalid
- * (too small or too large) or if there are not enough buffers available,
- * the packet is dropped.
+ * Pull packets from the provided fd and fill the receive-side virtqueue. We
+ * selectively use zero-copy approaches when possible.
+ *
+ * Returns 1 if guest notification is needed. Otherwise, returns -1 on failure
+ * or 0 if no notification is needed.
  */
-int
-vionet_enq_rx(struct vionet_dev *dev, char *pkt, size_t sz, int *spc)
+static int
+vionet_rx(struct vionet_dev *dev, int fd)
 {
-	uint16_t dxx, idx, hdr_desc_idx, chain_hdr_idx;
+	uint16_t idx, hdr_idx;
 	char *vr = NULL;
-	size_t bufsz = 0, off = 0, pkt_offset = 0, chunk_size = 0;
-	size_t chain_len = 0;
-	struct vring_desc *desc, *pkt_desc, *hdr_desc;
+	size_t chain_len = 0, iov_cnt;
+	struct vring_desc *desc, *table;
 	struct vring_avail *avail;
 	struct vring_used *used;
 	struct virtio_vq_info *vq_info;
-	struct virtio_net_hdr hdr;
-	size_t hdr_sz;
+	struct iovec *iov;
+	int notify = 0;
+	ssize_t sz;

-	if (sz < VIONET_MIN_TXLEN || sz > VIONET_MAX_TXLEN) {
-		log_warnx("%s: invalid packet size", __func__);
-		return (0);
-	}
-
-	hdr_sz = sizeof(hdr);
-
 	if (!(dev->cfg.device_status
 	    & VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK)) {
 		log_warnx("%s: driver not ready", __func__);
@@ -315,178 +342,287 @@ vionet_enq_rx(struct vionet_dev *dev, char *pkt, size_
 	}

 	vq_info = &dev->vq[RXQ];
+	idx = vq_info->last_avail;
 	vr = vq_info->q_hva;
 	if (vr == NULL)
 		fatalx("%s: vr == NULL", __func__);

-
 	/* Compute offsets in ring of descriptors, avail ring, and used ring */
-	desc = (struct vring_desc *)(vr);
+	table = (struct vring_desc *)(vr);
 	avail = (struct vring_avail *)(vr + vq_info->vq_availoffset);
 	used = (struct vring_used *)(vr + vq_info->vq_usedoffset);
+	used->flags |= VRING_USED_F_NO_NOTIFY;

-	idx = vq_info->last_avail & VIONET_QUEUE_MASK;
-	if ((vq_info->notified_avail & VIONET_QUEUE_MASK) == idx) {
-		log_debug("%s: insufficient available buffer capacity, "
-		    "dropping packet.", __func__);
-		return (0);
-	}
+	while (idx != avail->idx) {
+		hdr_idx = avail->ring[idx & VIONET_QUEUE_MASK];
+		desc = &table[hdr_idx & VIONET_QUEUE_MASK];
+		if (!DESC_WRITABLE(desc)) {
+			log_warnx("%s: invalid descriptor state", __func__);
+			goto reset;
+		}

-	hdr_desc_idx = avail->ring[idx] & VIONET_QUEUE_MASK;
-	hdr_desc = &desc[hdr_desc_idx];
+		iov = &iov_rx[0];
+		iov_cnt = 1;

-	dxx = hdr_desc_idx;
-	chain_hdr_idx = dxx;
-	chain_len = 0;
+		/*
+		 * First descriptor should be at least as large as the
+		 * virtio_net_hdr. It's not technically required, but in
+		 * legacy devices it should be safe to assume.
+		 */
+		iov->iov_len = desc->len;
+		if (iov->iov_len < sizeof(struct virtio_net_hdr)) {
+			log_warnx("%s: invalid descriptor length", __func__);
+			goto reset;
+		}

-	/* Process the descriptor and walk any potential chain. */
-	do {
-		off = 0;
-		pkt_desc = &desc[dxx];
-		if (!(pkt_desc->flags & VRING_DESC_F_WRITE)) {
-			log_warnx("%s: invalid descriptor, not writable",
+		/*
+		 * Insert the virtio_net_hdr and adjust len/base. We do the
+		 * pointer math here before it's a void*.
+		 */
+		iov->iov_base = hvaddr_mem(desc->addr, iov->iov_len);
+		if (iov->iov_base == NULL)
+			goto reset;
+		memset(iov->iov_base, 0, sizeof(struct virtio_net_hdr));
+
+		/* Tweak the iovec to account for the virtio_net_hdr. */
+		iov->iov_len -= sizeof(struct virtio_net_hdr);
+		iov->iov_base = hvaddr_mem(desc->addr +
+		    sizeof(struct virtio_net_hdr), iov->iov_len);
+		if (iov->iov_base == NULL)
+			goto reset;
+		chain_len = iov->iov_len;
+
+		/*
+		 * Walk the remaining chain and collect remaining addresses
+		 * and lengths.
+		 */
+		while (desc->flags & VRING_DESC_F_NEXT) {
+			desc = &table[desc->next & VIONET_QUEUE_MASK];
+			if (!DESC_WRITABLE(desc)) {
+				log_warnx("%s: invalid descriptor state",
+				    __func__);
+				goto reset;
+			}
+
+			/* Collect our IO information. Translate gpa's. */
+			iov = &iov_rx[iov_cnt];
+			iov->iov_len = desc->len;
+			iov->iov_base = hvaddr_mem(desc->addr, iov->iov_len);
+			if (iov->iov_base == NULL)
+				goto reset;
+			chain_len += iov->iov_len;
+
+			/* Guard against infinitely looping chains. */
+			if (++iov_cnt >= nitems(iov_rx)) {
+				log_warnx("%s: infinite chain detected",
+				    __func__);
+				goto reset;
+			}
+		}
+
+		/* Make sure the driver gave us the bare minimum buffers. */
+		if (chain_len < VIONET_MIN_TXLEN) {
+			log_warnx("%s: insufficient buffers provided",
 			    __func__);
-			return (0);
+			goto reset;
 		}

-		/* How much data do we get to write? */
-		if (sz - bufsz > pkt_desc->len)
-			chunk_size = pkt_desc->len;
+		/*
+		 * If we're enforcing hardware address or handling an injected
+		 * packet, we need to use a copy-based approach.
+		 */
+		if (dev->lockedmac || fd != dev->data_fd)
+			sz = vionet_rx_copy(dev, fd, iov_rx, iov_cnt,
+			    chain_len);
 		else
-			chunk_size = sz - bufsz;
-
-		if (chain_len == 0) {
-			off = hdr_sz;
-			if (chunk_size == pkt_desc->len)
-				chunk_size -= off;
-		}
-
-		/* Write a chunk of data if we need to */
-		if (chunk_size && write_mem(pkt_desc->addr + off,
-			pkt + pkt_offset, chunk_size)) {
-			log_warnx("%s: failed to write to buffer 0x%llx",
-			    __func__, pkt_desc->addr);
-			return (0);
-		}
-
-		chain_len += chunk_size + off;
-		bufsz += chunk_size;
-		pkt_offset += chunk_size;
-
-		dxx = pkt_desc->next & VIONET_QUEUE_MASK;
-	} while (bufsz < sz && pkt_desc->flags & VRING_DESC_F_NEXT);
-
-	/* Move our marker in the ring...*/
-	vq_info->last_avail = (vq_info->last_avail + 1) &
-	    VIONET_QUEUE_MASK;
-
-	/* Prepend the virtio net header in the first buffer. */
-	memset(&hdr, 0, sizeof(hdr));
-	hdr.hdr_len = hdr_sz;
-	if (write_mem(hdr_desc->addr, &hdr, hdr_sz)) {
-	    log_warnx("vionet: rx enq header write_mem error @ 0x%llx",
-		hdr_desc->addr);
-	    return (0);
-	}
-
-	/* Update the index field in the used ring. This must be done last. */
-	dev->cfg.isr_status = 1;
-	*spc = (vq_info->notified_avail - vq_info->last_avail)
-	    & VIONET_QUEUE_MASK;
-
-	/* Update the list of used buffers. */
-	used->ring[used->idx & VIONET_QUEUE_MASK].id = chain_hdr_idx;
-	used->ring[used->idx & VIONET_QUEUE_MASK].len = chain_len;
-	__sync_synchronize();
-	used->idx++;
-
-	return (1);
-}
-
-/*
- * vionet_rx
- *
- * Enqueue data that was received on a tap file descriptor
- * to the vionet device queue.
- */
-static int
-vionet_rx(struct vionet_dev *dev)
-{
-	char buf[PAGE_SIZE];
-	int num_enq = 0, spc = 0;
-	struct ether_header *eh;
-	ssize_t sz;
-
-	do {
-		sz = read(dev->data_fd, buf, sizeof(buf));
-		if (sz == -1) {
-			/*
-			 * If we get EAGAIN, No data is currently available.
-			 * Do not treat this as an error.
-			 */
-			if (errno != EAGAIN)
-				log_warn("%s: read error", __func__);
-		} else if (sz > 0) {
-			eh = (struct ether_header *)buf;
-			if (!dev->lockedmac ||
-			    ETHER_IS_MULTICAST(eh->ether_dhost) ||
-			    memcmp(eh->ether_dhost, dev->mac,
-			    sizeof(eh->ether_dhost)) == 0)
-				num_enq += vionet_enq_rx(dev, buf, sz, &spc);
-		} else if (sz == 0) {
-			log_debug("%s: no data", __func__);
+			sz = vionet_rx_zerocopy(dev, fd, iov_rx, iov_cnt);
+		if (sz == -1)
+			goto reset;
+		if (sz == 0)	/* No packets, so bail out for now. */
 			break;
+
+		/*
+		 * Account for the prefixed header since it wasn't included
+		 * in the copy or zerocopy operations.
+		 */
+		sz += sizeof(struct virtio_net_hdr);
+
+		/* Mark our buffers as used. */
+		used->ring[used->idx & VIONET_QUEUE_MASK].id = hdr_idx;
+		used->ring[used->idx & VIONET_QUEUE_MASK].len = sz;
+		__sync_synchronize();
+		used->idx++;
+		idx++;
+	}
+
+	if (idx != vq_info->last_avail &&
+	    !(avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) {
+		notify = 1;
+		dev->cfg.isr_status |= 1;
+	}
+
+	vq_info->last_avail = idx;
+	return (notify);
+reset:
+	log_warnx("%s: requesting device reset", __func__);
+	dev->cfg.device_status |= DEVICE_NEEDS_RESET;
+	dev->cfg.isr_status |= VIRTIO_CONFIG_ISR_CONFIG_CHANGE;
+	return (1);
+}
+
+/*
+ * vionet_rx_copy
+ *
+ * Read a packet off the provided file descriptor, validating packet
+ * characteristics, and copy it into the provided buffers in the iovec array.
+ *
+ * It's assumed that the provided iovec array contains validated host virtual
+ * address translations and not guest physical addresses.
+ *
+ * Returns number of bytes copied on success, 0 if packet is dropped, and
+ * -1 on an error.
+ */
+ssize_t
+vionet_rx_copy(struct vionet_dev *dev, int fd, const struct iovec *iov,
+    int iov_cnt, size_t chain_len)
+{
+	static uint8_t		 buf[VIONET_HARD_MTU];
+	struct packet		*pkt = NULL;
+	struct ether_header	*eh = NULL;
+	uint8_t			*payload = buf;
+	size_t			 i, chunk, nbytes, copied = 0;
+	ssize_t			 sz;
+
+	/* If reading from the tap(4), try to right-size the read. */
+	if (fd == dev->data_fd)
+		nbytes = MIN(chain_len, VIONET_HARD_MTU);
+	else if (fd == pipe_inject[READ])
+		nbytes = sizeof(struct packet);
+	else {
+		log_warnx("%s: invalid fd: %d", __func__, fd);
+		return (-1);
+	}
+
+	/*
+	 * Try to pull a packet. The fd should be non-blocking and we don't
+	 * care if we under-read (i.e. sz != nbytes) as we may not have a
+	 * packet large enough to fill the buffer.
+	 */
+	sz = read(fd, buf, nbytes);
+	if (sz == -1) {
+		if (errno != EAGAIN) {
+			log_warn("%s: error reading packet", __func__);
+			return (-1);
 		}
-	} while (spc > 0 && sz > 0);
+		return (0);
+	} else if (fd == dev->data_fd && sz < VIONET_MIN_TXLEN) {
+		/* If reading the tap(4), we should get valid ethernet. */
+		log_warnx("%s: invalid packet size", __func__);
+		return (0);
+	} else if (fd == pipe_inject[READ] && sz != sizeof(struct packet)) {
+		log_warnx("%s: invalid injected packet object", __func__);
+		return (0);
+	}

-	return (num_enq);
+	/* Decompose an injected packet, if that's what we're working with. */
+	if (fd == pipe_inject[READ]) {
+		pkt = (struct packet *)buf;
+		if (pkt->buf == NULL) {
+			log_warnx("%s: invalid injected packet, no buffer",
+			    __func__);
+			return (0);
+		}
+		if (pkt->len < VIONET_MIN_TXLEN || pkt->len > VIONET_MAX_TXLEN) {
+			log_warnx("%s: invalid injected packet size", __func__);
+			goto drop;
+		}
+		payload = pkt->buf;
+		sz = (ssize_t)pkt->len;
+	}
+
+	/* Validate the ethernet header, if required. */
+	if (dev->lockedmac) {
+		eh = (struct ether_header *)(payload);
+		if (!ETHER_IS_MULTICAST(eh->ether_dhost) &&
+		    memcmp(eh->ether_dhost, dev->mac,
+		    sizeof(eh->ether_dhost)) != 0)
+			goto drop;
+	}
+
+	/* Truncate one last time to the chain length, if shorter. */
+	sz = MIN(chain_len, (size_t)sz);
+
+	/*
+	 * Copy the packet into the provided buffers. We can use memcpy(3)
+	 * here as the gpa was validated and translated to an hva previously.
+	 */
+	for (i = 0; (int)i < iov_cnt && (size_t)sz > copied; i++) {
+		chunk = MIN(iov[i].iov_len, (size_t)(sz - copied));
+		memcpy(iov[i].iov_base, payload + copied, chunk);
+		copied += chunk;
+	}
+
+drop:
+	/* Free any injected packet buffer. */
+	if (pkt != NULL)
+		free(pkt->buf);
+
+	return (copied);
 }

 /*
+ * vionet_rx_zerocopy
+ *
+ * Perform a vectorized read from the given fd into the guest physical memory
+ * pointed to by iovecs.
+ *
+ * Returns number of bytes read on success, -1 on error, or 0 if EAGAIN was
+ * returned by readv.
+ *
+ */
+static ssize_t
+vionet_rx_zerocopy(struct vionet_dev *dev, int fd, const struct iovec *iov,
+    int iov_cnt)
+{
+	ssize_t		sz;
+
+	if (dev->lockedmac) {
+		log_warnx("%s: zerocopy not available for locked lladdr",
+		    __func__);
+		return (-1);
+	}
+
+	sz = readv(fd, iov, iov_cnt);
+	if (sz == -1 && errno == EAGAIN)
+		return (0);
+	return (sz);
+}
+
+
+/*
  * vionet_rx_event
  *
  * Called when new data can be received on the tap fd of a vionet device.
  */
 static void
-vionet_rx_event(int fd, short kind, void *arg)
+vionet_rx_event(int fd, short event, void *arg)
 {
 	struct virtio_dev *dev = (struct virtio_dev *)arg;

-	if (vionet_rx(&dev->vionet) > 0)
+	if (!(event & EV_READ))
+		fatalx("%s: invalid event type", __func__);
+
+	if (vionet_rx(&dev->vionet, fd) > 0)
 		virtio_assert_pic_irq(dev, 0);
 }

-void
-vionet_notify_rx(struct virtio_dev *dev)
-{
-	struct vionet_dev *vionet = &dev->vionet;
-	struct vring_avail *avail;
-	struct virtio_vq_info *vq_info;
-	char *vr;
-
-	vq_info = &vionet->vq[RXQ];
-	vr = vq_info->q_hva;
-	if (vr == NULL)
-		fatalx("%s: vr == NULL", __func__);
-
-	/* Compute offset into avail ring */
-	avail = (struct vring_avail *)(vr + vq_info->vq_availoffset);
-	vq_info->notified_avail = avail->idx - 1;
-}
-
-int
+static int
 vionet_notifyq(struct virtio_dev *dev)
 {
 	struct vionet_dev *vionet = &dev->vionet;
-	int ret = 0;

 	switch (vionet->cfg.queue_notify) {
-	case RXQ:
-		vionet_notify_rx(dev);
-		break;
-	case TXQ:
-		ret = vionet_notify_tx(dev);
-		break;
+	case TXQ: return vionet_notify_tx(dev);
 	default:
 		/*
 		 * Catch the unimplemented queue ID 2 (control queue) as
@@ -497,23 +633,25 @@ vionet_notifyq(struct virtio_dev *dev)
 		break;
 	}

-	return (ret);
+	return (0);
 }

-int
+static int
 vionet_notify_tx(struct virtio_dev *dev)
 {
-	uint16_t idx, pkt_desc_idx, hdr_desc_idx, dxx, cnt;
-	size_t pktsz, chunk_size = 0;
-	ssize_t dhcpsz = 0;
-	int num_enq, ofs, spc = 0;
-	char *vr = NULL, *pkt = NULL, *dhcppkt = NULL;
+	uint16_t idx, hdr_idx;
+	size_t chain_len, iov_cnt;
+	ssize_t dhcpsz = 0, sz;
+	int notify = 0;
+	char *vr = NULL, *dhcppkt = NULL;
 	struct vionet_dev *vionet = &dev->vionet;
-	struct vring_desc *desc, *pkt_desc, *hdr_desc;
+	struct vring_desc *desc, *table;
 	struct vring_avail *avail;
 	struct vring_used *used;
 	struct virtio_vq_info *vq_info;
 	struct ether_header *eh;
+	struct iovec *iov;
+	struct packet pkt;

 	if (!(vionet->cfg.device_status
 	    & VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK)) {
@@ -522,159 +660,161 @@ vionet_notify_tx(struct virtio_dev *dev)
 	}

 	vq_info = &vionet->vq[TXQ];
+	idx = vq_info->last_avail;
 	vr = vq_info->q_hva;
 	if (vr == NULL)
 		fatalx("%s: vr == NULL", __func__);

 	/* Compute offsets in ring of descriptors, avail ring, and used ring */
-	desc = (struct vring_desc *)(vr);
+	table = (struct vring_desc *)(vr);
 	avail = (struct vring_avail *)(vr + vq_info->vq_availoffset);
 	used = (struct vring_used *)(vr + vq_info->vq_usedoffset);

-	num_enq = 0;
+	while (idx != avail->idx) {
+		hdr_idx = avail->ring[idx & VIONET_QUEUE_MASK];
+		desc = &table[hdr_idx & VIONET_QUEUE_MASK];
+		if (DESC_WRITABLE(desc)) {
+			log_warnx("%s: invalid descriptor state", __func__);
+			goto reset;
+		}

-	idx = vq_info->last_avail & VIONET_QUEUE_MASK;
+		iov = &iov_tx[0];
+		iov_cnt = 0;
+		chain_len = 0;

-	if ((avail->idx & VIONET_QUEUE_MASK) == idx)
-		return (0);
+		/*
+		 * As a legacy device, we most likely will receive a lead
+		 * descriptor sized to the virtio_net_hdr. However, the framing
+		 * is not guaranteed, so check for packet data.
+		 */
+		iov->iov_len = desc->len;
+		if (iov->iov_len < sizeof(struct virtio_net_hdr)) {
+			log_warnx("%s: invalid descriptor length", __func__);
+			goto reset;
+		} else if (iov->iov_len > sizeof(struct virtio_net_hdr)) {
+			/* Chop off the virtio header, leaving packet data. */
+			iov->iov_len -= sizeof(struct virtio_net_hdr);
+			chain_len += iov->iov_len;
+			iov->iov_base = hvaddr_mem(desc->addr +
+			    sizeof(struct virtio_net_hdr), iov->iov_len);
+			if (iov->iov_base == NULL)
+				goto reset;
+			iov_cnt++;
+		}

-	while ((avail->idx & VIONET_QUEUE_MASK) != idx) {
-		hdr_desc_idx = avail->ring[idx] & VIONET_QUEUE_MASK;
-		hdr_desc = &desc[hdr_desc_idx];
-		pktsz = 0;
-
-		cnt = 0;
-		dxx = hdr_desc_idx;
-		do {
-			pktsz += desc[dxx].len;
-			dxx = desc[dxx].next & VIONET_QUEUE_MASK;
-
-			/*
-			 * Virtio 1.0, cs04, section 2.4.5:
-			 *  "The number of descriptors in the table is defined
-			 *   by the queue size for this virtqueue: this is the
-			 *   maximum possible descriptor chain length."
-			 */
-			if (++cnt >= VIONET_QUEUE_SIZE) {
-				log_warnx("%s: descriptor table invalid",
+		/*
+		 * Walk the chain and collect remaining addresses and lengths.
+		 */
+		while (desc->flags & VRING_DESC_F_NEXT) {
+			desc = &table[desc->next & VIONET_QUEUE_MASK];
+			if (DESC_WRITABLE(desc)) {
+				log_warnx("%s: invalid descriptor state",
 				    __func__);
-				goto out;
+				goto reset;
 			}
-		} while (desc[dxx].flags & VRING_DESC_F_NEXT);

-		pktsz += desc[dxx].len;
+			/* Collect our IO information, translating gpa's. */
+			iov = &iov_tx[iov_cnt];
+			iov->iov_len = desc->len;
+			iov->iov_base = hvaddr_mem(desc->addr, iov->iov_len);
+			if (iov->iov_base == NULL)
+				goto reset;
+			chain_len += iov->iov_len;

-		/* Remove virtio header descriptor len */
-		pktsz -= hdr_desc->len;
-
-		/* Drop packets violating device MTU-based limits */
-		if (pktsz < VIONET_MIN_TXLEN || pktsz > VIONET_MAX_TXLEN) {
-			log_warnx("%s: invalid packet size %lu", __func__,
-			    pktsz);
-			goto drop_packet;
-		}
-		pkt = malloc(pktsz);
-		if (pkt == NULL) {
-			log_warn("malloc error alloc packet buf");
-			goto out;
-		}
-
-		ofs = 0;
-		pkt_desc_idx = hdr_desc->next & VIONET_QUEUE_MASK;
-		pkt_desc = &desc[pkt_desc_idx];
-
-		while (pkt_desc->flags & VRING_DESC_F_NEXT) {
-			/* must be not writable */
-			if (pkt_desc->flags & VRING_DESC_F_WRITE) {
-				log_warnx("unexpected writable tx desc "
-				    "%d", pkt_desc_idx);
-				goto out;
-			}
-
-			/* Check we don't read beyond allocated pktsz */
-			if (pkt_desc->len > pktsz - ofs) {
-				log_warnx("%s: descriptor len past pkt len",
+			/* Guard against infinitely looping chains. */
+			if (++iov_cnt >= nitems(iov_tx)) {
+				log_warnx("%s: infinite chain detected",
 				    __func__);
-				chunk_size = pktsz - ofs;
-			} else
-				chunk_size = pkt_desc->len;
-
-			/* Read packet from descriptor ring */
-			if (read_mem(pkt_desc->addr, pkt + ofs, chunk_size)) {
-				log_warnx("vionet: packet read_mem error "
-				    "@ 0x%llx", pkt_desc->addr);
-				goto out;
+				goto reset;
 			}
-
-			ofs += pkt_desc->len;
-			pkt_desc_idx = pkt_desc->next & VIONET_QUEUE_MASK;
-			pkt_desc = &desc[pkt_desc_idx];
 		}

-		/* Now handle tail descriptor - must be not writable */
-		if (pkt_desc->flags & VRING_DESC_F_WRITE) {
-			log_warnx("unexpected writable tx descriptor %d",
-			    pkt_desc_idx);
-			goto out;
+		/* Check if we've got a minimum viable amount of data. */
+		if (chain_len < VIONET_MIN_TXLEN) {
+			sz = chain_len;
+			goto drop;
 		}

-		/* Check we don't read beyond allocated pktsz */
-		if (pkt_desc->len > pktsz - ofs) {
-			log_warnx("%s: descriptor len past pkt len", __func__);
-			chunk_size = pktsz - ofs - pkt_desc->len;
-		} else
-			chunk_size = pkt_desc->len;
-
-		/* Read packet from descriptor ring */
-		if (read_mem(pkt_desc->addr, pkt + ofs, chunk_size)) {
-			log_warnx("vionet: packet read_mem error @ "
-			    "0x%llx", pkt_desc->addr);
-			goto out;
+		/*
+		 * Packet inspection for ethernet header (if using a "local"
+		 * interface) for possibility of a DHCP packet or (if using
+		 * locked lladdr) for validating ethernet header.
+		 *
+		 * To help preserve zero-copy semantics, we require the first
+		 * descriptor with packet data contains a large enough buffer
+		 * for this inspection.
+		 */
+		iov = &iov_tx[0];
+		if (vionet->lockedmac) {
+			if (iov->iov_len < ETHER_HDR_LEN) {
+				log_warnx("%s: insufficient header data",
+				    __func__);
+				goto drop;
+			}
+			eh = (struct ether_header *)iov->iov_base;
+			if (memcmp(eh->ether_shost, vionet->mac,
+			    sizeof(eh->ether_shost)) != 0) {
+				log_warnx("%s: bad source address %s",
+				    __func__, ether_ntoa((struct ether_addr *)
+					eh->ether_shost));
+				sz = chain_len;
+				goto drop;
+			}
 		}
-
-		/* reject other source addresses */
-		if (vionet->lockedmac && pktsz >= ETHER_HDR_LEN &&
-		    (eh = (struct ether_header *)pkt) &&
-		    memcmp(eh->ether_shost, vionet->mac,
-		    sizeof(eh->ether_shost)) != 0)
-			log_debug("vionet: wrong source address %s for vm %d",
-			    ether_ntoa((struct ether_addr *)
-			    eh->ether_shost), dev->vm_id);
-		else if (vionet->local &&
-		    (dhcpsz = dhcp_request(dev, pkt, pktsz, &dhcppkt)) != -1) {
-			log_debug("vionet: dhcp request,"
-			    " local response size %zd", dhcpsz);
-
-		/* XXX signed vs unsigned here, funky cast */
-		} else if (write(vionet->data_fd, pkt, pktsz) != (int)pktsz) {
-			log_warnx("vionet: tx failed writing to tap: "
-			    "%d", errno);
-			goto out;
+		if (vionet->local) {
+			dhcpsz = dhcp_request(dev, iov->iov_base, iov->iov_len,
+			    &dhcppkt);
+			log_debug("%s: detected dhcp request of %zu bytes",
+			    __func__, dhcpsz);
 		}

-	drop_packet:
-		vionet->cfg.isr_status = 1;
-		used->ring[used->idx & VIONET_QUEUE_MASK].id = hdr_desc_idx;
-		used->ring[used->idx & VIONET_QUEUE_MASK].len = hdr_desc->len;
+		/* Write our packet to the tap(4). */
+		sz = writev(vionet->data_fd, iov_tx, iov_cnt);
+		if (sz == -1 && errno != ENOBUFS) {
+			log_warn("%s", __func__);
+			goto reset;
+		}
+		sz += sizeof(struct virtio_net_hdr);
+drop:
+		used->ring[used->idx & VIONET_QUEUE_MASK].id = hdr_idx;
+		used->ring[used->idx & VIONET_QUEUE_MASK].len = sz;
 		__sync_synchronize();
 		used->idx++;
+		idx++;

-		vq_info->last_avail = avail->idx & VIONET_QUEUE_MASK;
-		idx = (idx + 1) & VIONET_QUEUE_MASK;
+		/* Facilitate DHCP reply injection, if needed. */
+		if (dhcpsz > 0) {
+			pkt.buf = dhcppkt;
+			pkt.len = dhcpsz;
+			sz = write(pipe_inject[WRITE], &pkt, sizeof(pkt));
+			if (sz == -1 && errno != EAGAIN) {
+				log_warn("%s: packet injection", __func__);
+				free(pkt.buf);
+			} else if (sz == -1 && errno == EAGAIN) {
+				log_debug("%s: dropping dhcp reply", __func__);
+				free(pkt.buf);
+			} else if (sz != sizeof(pkt)) {
+				log_warnx("%s: failed packet injection",
+				    __func__);
+				free(pkt.buf);
+			}
+			log_debug("%s: injected dhcp reply with %ld bytes",
+			    __func__, sz);
+		}
+	}

-		num_enq++;
-
-		free(pkt);
-		pkt = NULL;
+	if (idx != vq_info->last_avail &&
+	    !(avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) {
+		notify = 1;
+		vionet->cfg.isr_status |= 1;
 	}

-	if (dhcpsz > 0)
-		vionet_enq_rx(vionet, dhcppkt, dhcpsz, &spc);
-
-out:
-	free(pkt);
-	free(dhcppkt);
-
+	vq_info->last_avail = idx;
+	return (notify);
+reset:
+	log_warnx("%s: requesting device reset", __func__);
+	vionet->cfg.device_status |= DEVICE_NEEDS_RESET;
+	vionet->cfg.isr_status |= VIRTIO_CONFIG_ISR_CONFIG_CHANGE;
 	return (1);
 }

@@ -732,12 +872,15 @@ dev_dispatch_vm(int fd, short event, void *arg)
 		case IMSG_VMDOP_PAUSE_VM:
 			log_debug("%s: pausing", __func__);
 			event_del(&ev_tap);
+			event_del(&ev_inject);
 			break;
 		case IMSG_VMDOP_UNPAUSE_VM:
 			log_debug("%s: unpausing", __func__);
 			if (vionet->cfg.device_status
-			    & VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK)
+			    & VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK) {
 				event_add(&ev_tap, NULL);
+				event_add(&ev_inject, NULL);
+			}
 			break;
 		case IMSG_CTL_VERBOSE:
 			IMSG_SIZE_CHECK(&imsg, &verbose);
@@ -825,6 +968,7 @@ handle_sync_io(int fd, short event, void *arg)
 		case VIODEV_MSG_SHUTDOWN:
 			event_del(&dev->sync_iev.ev);
 			event_del(&ev_tap);
+			event_del(&ev_inject);
 			event_loopbreak();
 			return;
 		default:
@@ -885,11 +1029,11 @@ handle_io_write(struct viodev_msg *msg, struct virtio_
 			virtio_deassert_pic_irq(dev, msg->vcpu);
 		}
 		event_del(&ev_tap);
+		event_del(&ev_inject);
 		if (vionet->cfg.device_status
 		    & VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK) {
-			if (event_add(&ev_tap, NULL))
-				log_warn("%s: could not initialize virtio tap "
-				    "event handler", __func__);
+			event_add(&ev_tap, NULL);
+			event_add(&ev_inject, NULL);
 		}
 		break;
 	default:
blob - 23608635c85d151186c39ea3c72e75f8a9eb7f68
blob + 2593643d6fdbc9109c8462ef661169b6a245a46e
--- usr.sbin/vmd/virtio.h
+++ usr.sbin/vmd/virtio.h
@@ -366,13 +366,6 @@ int vioblk_restore(int, struct vmd_vm *, int[][VM_MAX_

 int vionet_dump(int);
 int vionet_restore(int, struct vmd_vm *, int *);
-void vionet_update_qs(struct vionet_dev *);
-void vionet_update_qa(struct vionet_dev *);
-int vionet_notifyq(struct virtio_dev *);
-void vionet_notify_rx(struct virtio_dev *);
-int vionet_notify_tx(struct virtio_dev *);
-void vionet_process_rx(uint32_t);
-int vionet_enq_rx(struct vionet_dev *, char *, size_t, int *);
 void vionet_set_hostmac(struct vmd_vm *, unsigned int, uint8_t *);

 int vmmci_io(int, uint16_t, uint32_t *, uint8_t *, void *, uint8_t);
