On Thu, Jul 10, 2025 at 5:05 PM Eugenio Perez Martin <eperezma@xxxxxxxxxx> wrote:
>
> On Tue, Jul 8, 2025 at 8:48 AM Jason Wang <jasowang@xxxxxxxxxx> wrote:
> >
> > This patch adds basic in order support for vhost. Two optimizations
> > are implemented in this patch:
> >
> > 1) Since driver uses descriptor in order, vhost can deduce the next
> > avail ring head by counting the number of descriptors that has been
> > used in next_avail_head. This eliminate the need to access the
> > available ring in vhost.
> >
> > 2) vhost_add_used_and_singal_n() is extended to accept the number of
> > batched buffers per used elem. While this increases the times of
> > usersapce memory access but it helps to reduce the chance of
>
> s/usersapce/userspace/
>
> > used ring access of both the driver and vhost.
> >
> > Vhost-net will be the first user for this.
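
To make 1) more concrete, here is a simplified sketch of the idea
(illustrative only: demo_vq and demo_next_head are made-up names using
kernel-style types, not code from this patch):

struct demo_vq {
        u16 num;                /* ring size, a power of two */
        u16 next_avail_head;    /* descriptors consumed so far */
};

/* With VIRTIO_F_IN_ORDER the driver places chains sequentially, so
 * the head of the next chain is just the running descriptor count
 * masked by the ring size; avail->ring[] never has to be read. */
static u16 demo_next_head(struct demo_vq *vq, u16 chain_len)
{
        u16 head = vq->next_avail_head & (vq->num - 1);

        vq->next_avail_head += chain_len; /* step over the whole chain */
        return head;
}

vhost_get_vq_desc() below does the same thing, with c counting the
descriptors of the chain it has just walked.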
> >
> > Signed-off-by: Jason Wang <jasowang@xxxxxxxxxx>
> > ---
> >  drivers/vhost/net.c   |   6 ++-
> >  drivers/vhost/vhost.c | 121 +++++++++++++++++++++++++++++++++++-------
> >  drivers/vhost/vhost.h |   8 ++-
> >  3 files changed, 111 insertions(+), 24 deletions(-)
> >
> > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> > index 7cbfc7d718b3..4f9c67f17b49 100644
> > --- a/drivers/vhost/net.c
> > +++ b/drivers/vhost/net.c
> > @@ -374,7 +374,8 @@ static void vhost_zerocopy_signal_used(struct vhost_net *net,
> >          while (j) {
> >                  add = min(UIO_MAXIOV - nvq->done_idx, j);
> >                  vhost_add_used_and_signal_n(vq->dev, vq,
> > -                                            &vq->heads[nvq->done_idx], add);
> > +                                            &vq->heads[nvq->done_idx],
> > +                                            NULL, add);
> >                  nvq->done_idx = (nvq->done_idx + add) % UIO_MAXIOV;
> >                  j -= add;
> >          }
> > @@ -457,7 +458,8 @@ static void vhost_net_signal_used(struct vhost_net_virtqueue *nvq)
> >          if (!nvq->done_idx)
> >                  return;
> >
> > -        vhost_add_used_and_signal_n(dev, vq, vq->heads, nvq->done_idx);
> > +        vhost_add_used_and_signal_n(dev, vq, vq->heads, NULL,
> > +                                    nvq->done_idx);
> >          nvq->done_idx = 0;
> >  }
> >
> > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> > index 3a5ebb973dba..c7ed069fc49e 100644
> > --- a/drivers/vhost/vhost.c
> > +++ b/drivers/vhost/vhost.c
> > @@ -364,6 +364,7 @@ static void vhost_vq_reset(struct vhost_dev *dev,
> >          vq->avail = NULL;
> >          vq->used = NULL;
> >          vq->last_avail_idx = 0;
> > +        vq->next_avail_head = 0;
> >          vq->avail_idx = 0;
> >          vq->last_used_idx = 0;
> >          vq->signalled_used = 0;
> > @@ -455,6 +456,8 @@ static void vhost_vq_free_iovecs(struct vhost_virtqueue *vq)
> >          vq->log = NULL;
> >          kfree(vq->heads);
> >          vq->heads = NULL;
> > +        kfree(vq->nheads);
> > +        vq->nheads = NULL;
> >  }
> >
> >  /* Helper to allocate iovec buffers for all vqs. */
> > @@ -472,7 +475,9 @@ static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
> >                                    GFP_KERNEL);
> >          vq->heads = kmalloc_array(dev->iov_limit, sizeof(*vq->heads),
> >                                    GFP_KERNEL);
> > -        if (!vq->indirect || !vq->log || !vq->heads)
> > +        vq->nheads = kmalloc_array(dev->iov_limit, sizeof(*vq->nheads),
> > +                                   GFP_KERNEL);
> > +        if (!vq->indirect || !vq->log || !vq->heads || !vq->nheads)
> >              goto err_nomem;
> >      }
> >      return 0;
> > @@ -1990,14 +1995,15 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg
> >              break;
> >          }
> >          if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED)) {
> > -            vq->last_avail_idx = s.num & 0xffff;
> > +            vq->next_avail_head = vq->last_avail_idx =
> > +                                  s.num & 0xffff;
> >              vq->last_used_idx = (s.num >> 16) & 0xffff;
> >          } else {
> >              if (s.num > 0xffff) {
> >                  r = -EINVAL;
> >                  break;
> >              }
> > -            vq->last_avail_idx = s.num;
> > +            vq->next_avail_head = vq->last_avail_idx = s.num;
>
> Why not just reuse last_avail_idx instead of creating next_avail_head?
>
> At first glance it seemed to me that it was done this way to support
> rewinding, but the in_order path will happily reuse next_avail_head
> without checking last_avail_idx except for checking if the ring is
> empty. Am I missing something?

Because the driver can submit a batch of available buffers, so
last_avail_idx is not necessarily equal to next_avail_head.
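
For example (illustrative numbers, not taken from the patch): if the
driver makes three chains of length 2, 1 and 3 available, the two
counters evolve like this as vhost_get_vq_desc() consumes them:

        chain  len  last_avail_idx  next_avail_head
        #0     2    0 -> 1          0 -> 2
        #1     1    1 -> 2          2 -> 3
        #2     3    2 -> 3          3 -> 6

last_avail_idx counts avail ring entries (one per chain) while
next_avail_head counts descriptors, so the two diverge as soon as any
chain has more than one descriptor.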
> >          }
> >          /* Forget the cached index value. */
> >          vq->avail_idx = vq->last_avail_idx;
> > @@ -2590,11 +2596,12 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
> >                unsigned int *out_num, unsigned int *in_num,
> >                struct vhost_log *log, unsigned int *log_num)
> >  {
> > +    bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER);
> >      struct vring_desc desc;
> >      unsigned int i, head, found = 0;
> >      u16 last_avail_idx = vq->last_avail_idx;
> >      __virtio16 ring_head;
> > -    int ret, access;
> > +    int ret, access, c = 0;
> >
> >      if (vq->avail_idx == vq->last_avail_idx) {
> >          ret = vhost_get_avail_idx(vq);
> > @@ -2605,17 +2612,21 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
> >          return vq->num;
> >      }
> >
> > -    /* Grab the next descriptor number they're advertising, and increment
> > -     * the index we've seen. */
> > -    if (unlikely(vhost_get_avail_head(vq, &ring_head, last_avail_idx))) {
> > -        vq_err(vq, "Failed to read head: idx %d address %p\n",
> > -               last_avail_idx,
> > -               &vq->avail->ring[last_avail_idx % vq->num]);
> > -        return -EFAULT;
> > +    if (in_order)
> > +        head = vq->next_avail_head & (vq->num - 1);
> > +    else {
> > +        /* Grab the next descriptor number they're
> > +         * advertising, and increment the index we've seen. */
> > +        if (unlikely(vhost_get_avail_head(vq, &ring_head,
> > +                                          last_avail_idx))) {
> > +            vq_err(vq, "Failed to read head: idx %d address %p\n",
> > +                   last_avail_idx,
> > +                   &vq->avail->ring[last_avail_idx % vq->num]);
> > +            return -EFAULT;
> > +        }
> > +        head = vhost16_to_cpu(vq, ring_head);
> >      }
> >
> > -    head = vhost16_to_cpu(vq, ring_head);
> > -
> >      /* If their number is silly, that's an error. */
> >      if (unlikely(head >= vq->num)) {
> >          vq_err(vq, "Guest says index %u > %u is available",
> > @@ -2658,6 +2669,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
> >                         "in indirect descriptor at idx %d\n", i);
> >                  return ret;
> >              }
> > +            ++c;
> >              continue;
> >          }
> >
> > @@ -2693,10 +2705,12 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
> >              }
> >              *out_num += ret;
> >          }
> > +        ++c;
> >      } while ((i = next_desc(vq, &desc)) != -1);
> >
> >      /* On success, increment avail index. */
> >      vq->last_avail_idx++;
> > +    vq->next_avail_head += c;
> >
> >      /* Assume notifications from guest are disabled at this point,
> >       * if they aren't we would need to update avail_event index. */
> > @@ -2720,8 +2734,9 @@ int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len)
> >          cpu_to_vhost32(vq, head),
> >          cpu_to_vhost32(vq, len)
> >      };
> > +    u16 nheads = 1;
> >
> > -    return vhost_add_used_n(vq, &heads, 1);
> > +    return vhost_add_used_n(vq, &heads, &nheads, 1);
> >  }
> >  EXPORT_SYMBOL_GPL(vhost_add_used);
> >
> > @@ -2757,10 +2772,10 @@ static int __vhost_add_used_n(struct vhost_virtqueue *vq,
> >      return 0;
> >  }
> >
> > -/* After we've used one of their buffers, we tell them about it. We'll then
> > - * want to notify the guest, using eventfd. */
> > -int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
> > -             unsigned count)
> > +static int vhost_add_used_n_ooo(struct vhost_virtqueue *vq,
> > +                struct vring_used_elem *heads,
> > +                u16 *nheads,
> > +                unsigned count)
>
> nheads is not used in this function and it is checked to be NULL in
> the caller, should we remove it from the parameter list?

Exactly.

> >  {
> >      int start, n, r;
> >
> > @@ -2775,6 +2790,70 @@ int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
> >      }
> >      r = __vhost_add_used_n(vq, heads, count);
> >
> > +    return r;
>
> Nit: We can merge with the previous statement and do "return
> __vhost_add_used_n(vq, heads, count);"

Right.

> > +}
> > +
> > +static int vhost_add_used_n_in_order(struct vhost_virtqueue *vq,
> > +                     struct vring_used_elem *heads,
> > +                     u16 *nheads,
>
> Nit: we can const-ify nheads, and do the same for _in_order variant
> and vhost_add_used_n. Actually we can do it with heads too but it
> requires more changes to existing code. I think it would be nice to
> constify *nheads if you need to respin.

Let me do that.

> > +                     unsigned count)
> > +{
> > +    vring_used_elem_t __user *used;
> > +    u16 old, new = vq->last_used_idx;
> > +    int start, i;
> > +
> > +    if (!nheads)
> > +        return -EINVAL;
> > +
> > +    start = vq->last_used_idx & (vq->num - 1);
> > +    used = vq->used->ring + start;
> > +
> > +    for (i = 0; i < count; i++) {
> > +        if (vhost_put_used(vq, &heads[i], start, 1)) {
> > +            vq_err(vq, "Failed to write used");
> > +            return -EFAULT;
> > +        }
> > +        start += nheads[i];
> > +        new += nheads[i];
> > +        if (start >= vq->num)
> > +            start -= vq->num;
> > +    }
> > +
> > +    if (unlikely(vq->log_used)) {
> > +        /* Make sure data is seen before log. */
> > +        smp_wmb();
> > +        /* Log used ring entry write. */
> > +        log_used(vq, ((void __user *)used - (void __user *)vq->used),
> > +             (vq->num - start) * sizeof *used);
> > +        if (start + count > vq->num)
> > +            log_used(vq, 0,
> > +                 (start + count - vq->num) * sizeof *used);
> > +    }
> > +
> > +    old = vq->last_used_idx;
> > +    vq->last_used_idx = new;
> > +    /* If the driver never bothers to signal in a very long while,
> > +     * used index might wrap around. If that happens, invalidate
> > +     * signalled_used index we stored. TODO: make sure driver
> > +     * signals at least once in 2^16 and remove this. */
> > +    if (unlikely((u16)(new - vq->signalled_used) < (u16)(new - old)))
> > +        vq->signalled_used_valid = false;
> > +    return 0;
> > +}
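
For what it's worth, the in-order path above boils down to the sketch
below (demo-only code: demo_vq, demo_put_used and the trimmed-down
fields are made up, and the logging and signalled_used handling are
left out):

struct vring_used_elem;    /* id/len pair from the virtio headers */

struct demo_vq {
        u16 num;            /* ring size, a power of two */
        u16 last_used_idx;
};

/* Stand-in for vhost_put_used(): copy one elem to used->ring[slot]. */
int demo_put_used(struct demo_vq *vq, struct vring_used_elem *elem, u16 slot);

static int demo_add_used_in_order(struct demo_vq *vq,
                                  struct vring_used_elem *heads,
                                  const u16 *nheads, unsigned int count)
{
        u16 start = vq->last_used_idx & (vq->num - 1);
        unsigned int i;

        for (i = 0; i < count; i++) {
                /* one written elem stands for nheads[i] in-order buffers */
                if (demo_put_used(vq, &heads[i], start))
                        return -EFAULT;
                start = (start + nheads[i]) & (vq->num - 1);
                vq->last_used_idx += nheads[i];
        }
        return 0;
}

So vhost touches the used ring once per batch instead of once per
buffer, which is the point of passing nheads down from vhost-net.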
> > +
> > +/* After we've used one of their buffers, we tell them about it. We'll then
> > + * want to notify the guest, using eventfd. */
> > +int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
> > +             u16 *nheads, unsigned count)
> > +{
> > +    bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER);
> > +    int r;
> > +
> > +    if (!in_order || !nheads)
> > +        r = vhost_add_used_n_ooo(vq, heads, nheads, count);
> > +    else
> > +        r = vhost_add_used_n_in_order(vq, heads, nheads, count);
> > +
>
> I just realized the original code didn't do it either, but we should
> return if r < 0 here. Otherwise, used->ring[] has a random value and
> used->idx is incremented covering these values. This should be
> triggerable in a guest that sets used->idx valid but used->ring[]
> invalid, for example.

This looks like a bug, I will send an independent fix.

> >      /* Make sure buffer is written before we update index. */
> >      smp_wmb();
> >      if (vhost_put_used_idx(vq)) {
> > @@ -2853,9 +2932,11 @@ EXPORT_SYMBOL_GPL(vhost_add_used_and_signal);
> >  /* multi-buffer version of vhost_add_used_and_signal */
> >  void vhost_add_used_and_signal_n(struct vhost_dev *dev,
> >                   struct vhost_virtqueue *vq,
> > -                 struct vring_used_elem *heads, unsigned count)
> > +                 struct vring_used_elem *heads,
> > +                 u16 *nheads,
> > +                 unsigned count)
> >  {
> > -    vhost_add_used_n(vq, heads, count);
> > +    vhost_add_used_n(vq, heads, nheads, count);
> >      vhost_signal(dev, vq);
> >  }
> >  EXPORT_SYMBOL_GPL(vhost_add_used_and_signal_n);
> > diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> > index bb75a292d50c..dca9f309d396 100644
> > --- a/drivers/vhost/vhost.h
> > +++ b/drivers/vhost/vhost.h
> > @@ -103,6 +103,8 @@ struct vhost_virtqueue {
> >       * Values are limited to 0x7fff, and the high bit is used as
> >       * a wrap counter when using VIRTIO_F_RING_PACKED. */
> >      u16 last_avail_idx;
> > +    /* Next avail ring head when VIRTIO_F_IN_ORDER is neogitated */
>
> s/neogitated/negotiated/

Will fix it.

> > +    u16 next_avail_head;
> >
> >      /* Caches available index value from user. */
> >      u16 avail_idx;
> > @@ -129,6 +131,7 @@ struct vhost_virtqueue {
> >      struct iovec iotlb_iov[64];
> >      struct iovec *indirect;
> >      struct vring_used_elem *heads;
> > +    u16 *nheads;
> >      /* Protected by virtqueue mutex. */
> >      struct vhost_iotlb *umem;
> >      struct vhost_iotlb *iotlb;
> > @@ -213,11 +216,12 @@ bool vhost_vq_is_setup(struct vhost_virtqueue *vq);
> >  int vhost_vq_init_access(struct vhost_virtqueue *);
> >  int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len);
> >  int vhost_add_used_n(struct vhost_virtqueue *, struct vring_used_elem *heads,
> > -             unsigned count);
> > +             u16 *nheads, unsigned count);
> >  void vhost_add_used_and_signal(struct vhost_dev *, struct vhost_virtqueue *,
> >                     unsigned int id, int len);
> >  void vhost_add_used_and_signal_n(struct vhost_dev *, struct vhost_virtqueue *,
> > -                 struct vring_used_elem *heads, unsigned count);
> > +                 struct vring_used_elem *heads, u16 *nheads,
> > +                 unsigned count);
> >  void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *);
> >  void vhost_disable_notify(struct vhost_dev *, struct vhost_virtqueue *);
> >  bool vhost_vq_avail_empty(struct vhost_dev *, struct vhost_virtqueue *);
> > --
> > 2.31.1
> >
>

Thanks