diff mbox series

[RFC] vhost/vsock: Add vsock_list file to map cid with vhost tasks

Message ID 20210505163855.32dad8e7@gandalf.local.home
State New
Headers show
Series [RFC] vhost/vsock: Add vsock_list file to map cid with vhost tasks | expand

Commit Message

Steven Rostedt May 5, 2021, 8:38 p.m. UTC
The new trace-cmd 3.0 (which is almost ready to be released) allows for
tracing between host and guests with timestamp synchronization such that
the events on the host and the guest can be interleaved in the proper order
that they occur. KernelShark now has a plugin that visualizes this
interaction.

The implementation requires that the guest has a vsock CID assigned, and on
the guest a "trace-cmd agent" is running, that will listen on a port for
the CID. Then on the host a "trace-cmd record -A guest@cid:port -e events"
can be called and the host will connect to the guest agent through the
cid/port pair and have the agent enable tracing on behalf of the host and
send the trace data back down to it.

The problem is that there is no sure fire way to find the CID for a guest.
Currently, the user must know the cid, or we have a hack that looks for the
qemu process and parses the --guest-cid parameter from it. But this is
prone to error and does not work on other implementations (was told that
crosvm does not use qemu).

As I can not find a way to discover CIDs assigned to guests via any kernel
interface, I decided to create this one. Note, I'm not attached to it. If
there's a better way to do this, I would love to have it. But since I'm not
an expert in the networking layer nor virtio, I decided to stick to what I
know and add a debugfs interface that simply lists all the registered CIDs
and the worker task that they are associated with. The worker task at
least has the PID of the task it represents.

Now I can find the cid / host process in charge of the guest pair:

  # cat /sys/kernel/debug/vsock_list
  3	vhost-1954:2002

  # ps aux | grep 1954
  qemu        1954  9.9 21.3 1629092 796148 ?      Sl   16:22   0:58  /usr/bin/qemu-kvm -name guest=Fedora21,debug-threads=on -S -object secret,id=masterKey0,format=raw,file=/var/lib/libvirt/qemu/domain-1-Fedora21/master-key.aes -machine pc-1.2,accel=kvm,usb=off,dump-guest-core=off -cpu qemu64 -m 1000 -overcommit mem-lock=off -smp 2,sockets=2,cores=1,threads=1 -uuid 1eefeeb0-3ac7-07c1-926e-236908313b4c -no-user-config -nodefaults -chardev socket,id=charmonitor,fd=32,server,nowait -mon chardev=charmonitor,id=monitor,mode=control -rtc base=utc -no-shutdown -boot strict=on -device piix3-usb-uhci,id=usb,bus=pci.0,addr=0x1.0x2 -device virtio-serial-pci,id=virtio-serial0,bus=pci.0,addr=0x6 -blockdev {"driver":"host_device","filename":"/dev/mapper/vg_bxtest-GuestFedora","node-name":"libvirt-1-storage","auto-read-only":true,"discard":"unmap"} -blockdev {"node-name":"libvirt-1-format","read-only":false,"driver":"raw","file":"libvirt-1-storage"} -device ide-hd,bus=ide.0,unit=0,drive=libvirt-1-
 format,id=ide0-0-0,bootindex=1 -netdev tap,fd=34,id=hostnet0 -device rtl8139,netdev=hostnet0,id=net0,mac=52:54:00:9f:e9:d5,bus=pci.0,addr=0x3 -netdev tap,fd=35,id=hostnet1 -device virtio-net-pci,netdev=hostnet1,id=net1,mac=52:54:00:ec:dc:6e,bus=pci.0,addr=0x5 -chardev pty,id=charserial0 -device isa-serial,chardev=charserial0,id=serial0 -chardev pipe,id=charchannel0,path=/var/lib/trace-cmd/virt/Fedora21/trace-pipe-cpu0 -device virtserialport,bus=virtio-serial0.0,nr=1,chardev=charchannel0,id=channel0,name=trace-pipe-cpu0 -chardev pipe,id=charchannel1,path=/var/lib/trace-cmd/virt/Fedora21/trace-pipe-cpu1 -device virtserialport,bus=virtio-serial0.0,nr=2,chardev=charchannel1,id=channel1,name=trace-pipe-cpu1 -vnc 127.0.0.1:0 -device cirrus-vga,id=video0,bus=pci.0,addr=0x2 -device virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0x4 -sandbox on,obsolete=deny,elevateprivileges=deny,spawn=deny,resourcecontrol=deny -device vhost-vsock-pci,id=vsock0,guest-cid=3,vhostfd=16,bus=pci.0,addr=0x7 -msg 
 timestamp=on
  root        2000  0.0  0.0      0     0 ?        S    16:22   0:00 [kvm-pit/1954]
  root        2002  0.0  0.0      0     0 ?        S    16:22   0:00 [vhost-1954]


This is just an example of what I'm looking for. Just a way to find what
process is using what cid.

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---

Comments

Steven Rostedt May 6, 2021, 1:03 a.m. UTC | #1
For kicks, I wrote this program that uses libtracefs to search all CIDS
(1-255), and find the kvm guests that are attached to them.

It traces the sched_wakeup and kvm_exit, looking for:

 this_task -> wakeup -> wakeup -> kvm_exit

when doing a connect to a cid.

When it finds the pid that did a kvm_exit, it knows that's the PID that
is woken by the vhost worker task. It's a little slow, and I would
really like a better way to do this, but it's at least an option that
is available now.

-- Steve
#define _GNU_SOURCE
#include <asm/types.h>
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <linux/vm_sockets.h>

#include <tracefs.h>

/* Exclusive upper bound of the CID values probed by find_cids(). */
#define MAX_CID		256

/* PID of this program; set in main() and used by find_cid() to seed the pid list. */
static int this_pid;

/*
 * open_vsock - open a stream vsock connection to @cid on @port.
 *
 * Returns the connected socket descriptor on success, or -1 if either the
 * socket could not be created or the connection attempt failed. No
 * descriptor is left open on failure.
 */
static int open_vsock(unsigned int cid, unsigned int port)
{
	struct sockaddr_vm addr = {
		.svm_family = AF_VSOCK,
		.svm_cid = cid,
		.svm_port = port,
	};
	int sd;

	sd = socket(AF_VSOCK, SOCK_STREAM, 0);
	if (sd < 0)
		return -1;

	if (connect(sd, (struct sockaddr *)&addr, sizeof(addr))) {
		/* Release the socket, otherwise every failed probe leaks a descriptor */
		close(sd);
		return -1;
	}

	return sd;
}

/* Node of a singly linked list of process IDs. */
struct pids {
	struct pids		*next;	/* following node, NULL at the end of the list */
	int			pid;	/* process ID stored in this node */
};

struct trace_info {
	struct tracefs_instance		*instance;
	struct tep_handle		*tep;
	struct tep_event		*wake_up;
	struct tep_event		*kvm_exit;
	struct tep_format_field		*common_pid;
	struct tep_format_field		*wake_pid;
	struct pids			*pids;
	int				cid;
	int				pid;
};

/*
 * Disable all events in the instance, destroy and free the instance, then
 * free the event parser handle.
 */
static void tear_down_trace(struct trace_info *info)
{
	struct tracefs_instance *inst = info->instance;

	tracefs_instance_file_write(inst, "events/enable", "0");
	tracefs_instance_destroy(inst);
	tracefs_instance_free(inst);

	tep_free(info->tep);
}

/*
 * setup_trace - create a private tracefs instance and enable the events
 * needed to follow a wake up chain that ends in a kvm_exit.
 *
 * Creates an instance named "vsock_find-<pid>", turns tracing off in it,
 * loads the "sched" and "kvm" event systems, looks up the sched_waking and
 * kvm_exit events plus the fields read by callback(), and finally enables
 * both events in the instance.
 *
 * Returns 0 on success. Returns a negative value on failure; once the
 * instance has been created, a failure also tears it down again via
 * tear_down_trace().
 */
static int setup_trace(struct trace_info *info)
{
	const char *systems[] = { "sched", "kvm", NULL};
	char *name;
	int ret;

	info->pids = NULL;

	/*
	 * No trailing newline here: the string is used as the name of the
	 * instance, so a "\n" would end up in that name.
	 */
	ret = asprintf(&name, "vsock_find-%d", getpid());
	if (ret < 0)
		return ret;

	info->instance = tracefs_instance_create(name);
	free(name);
	if (!info->instance)
		return -1;

	/* Keep tracing off until find_cid() explicitly turns it on */
	tracefs_trace_off(info->instance);
	info->tep = tracefs_local_events_system(NULL, systems);
	if (!info->tep)
		goto fail;

	info->wake_up = tep_find_event_by_name(info->tep, "sched", "sched_waking");
	if (!info->wake_up) {
		fprintf(stderr, "Failed to find sched_waking\n");
		goto fail;
	}

	info->kvm_exit = tep_find_event_by_name(info->tep, "kvm", "kvm_exit");
	if (!info->kvm_exit) {
		fprintf(stderr, "Failed to find kvm_exit\n");
		goto fail;
	}

	info->wake_pid = tep_find_any_field(info->wake_up, "pid");
	if (!info->wake_pid) {
		fprintf(stderr, "Failed to find wake up pid\n");
		goto fail;
	}

	info->common_pid = tep_find_common_field(info->wake_up,
						 "common_pid");
	if (!info->common_pid) {
		fprintf(stderr, "Failed to find common pid\n");
		goto fail;
	}

	ret = tracefs_instance_file_write(info->instance, "events/sched/sched_waking/enable", "1");
	if (ret < 0) {
		fprintf(stderr, "Failed to enable sched_waking\n");
		goto fail;
	}

	ret = tracefs_instance_file_write(info->instance, "events/kvm/kvm_exit/enable", "1");
	if (ret < 0) {
		fprintf(stderr, "Failed to enable kvm_exit\n");
		goto fail;
	}

	return 0;
fail:
	tear_down_trace(info);
	return -1;
}


/* Free every node of the pid list starting at @pids (which may be NULL). */
static void free_pids(struct pids *pids)
{
	struct pids *node;

	for (node = pids; node; node = pids) {
		/* Grab the successor before the current node goes away */
		pids = node->next;
		free(node);
	}
}

/*
 * Push @pid onto the front of the list whose head pointer is @pids.
 * If the allocation fails the list is left untouched.
 */
static void add_pid(struct pids **pids, int pid)
{
	struct pids *node = malloc(sizeof(*node));

	if (node) {
		node->pid = pid;
		node->next = *pids;
		*pids = node;
	}
}

/* Return true if @pid is stored in the list @pids, false otherwise. */
static bool match_pid(struct pids *pids, int pid)
{
	struct pids *node;

	for (node = pids; node; node = node->next) {
		if (node->pid == pid)
			return true;
	}

	return false;
}

/*
 * Per-record handler passed to tracefs_iterate_raw_events() by find_cid().
 *
 * Records produced by tasks that are not in info->pids are ignored. For a
 * followed task, a kvm_exit record stores that task's pid in info->pid and
 * returns -1 (presumably ending the iteration - confirm against the
 * libtracefs documentation); a wake up record adds the woken pid to the
 * list of followed tasks. All other cases return 0.
 */
static int callback(struct tep_event *event, struct tep_record *record,
		    int cpu, void *data)
{
	struct trace_info *info = data;
	unsigned long long field;
	int event_id;
	int task;

	if (tep_read_number_field(info->common_pid, record->data, &field) < 0)
		return 0;
	task = field;

	/* Only events generated by a task already in the chain matter */
	if (!match_pid(info->pids, task))
		return 0;

	event_id = tep_data_type(info->tep, record);

	if (event_id == info->kvm_exit->id) {
		info->pid = task;
		return -1;
	}

	/* A followed task woke somebody up: follow that task as well */
	if (event_id == info->wake_up->id &&
	    tep_read_number_field(info->wake_pid, record->data, &field) >= 0)
		add_pid(&info->pids, (int)field);

	return 0;
}

/*
 * print_cid_pid - print "<cid>\t<tgid>" for the thread group that @pid
 * belongs to.
 *
 * The thread group ID is taken from the "Tgid:" line of /proc/<pid>/status.
 * Nothing is printed if the file cannot be opened or no usable "Tgid:" line
 * is found.
 */
static void print_cid_pid(int cid, int pid)
{
	FILE *fp;
	char *path;
	char *buf = NULL;
	char *save;
	size_t l = 0;
	int tgid = -1;

	if (asprintf(&path, "/proc/%d/status", pid) < 0)
		return;

	fp = fopen(path, "r");
	free(path);
	if (!fp)
		return;

	while (getline(&buf, &l, fp) > 0) {
		char *tok;

		if (strncmp(buf, "Tgid:", 5) != 0)
			continue;
		/* First token is the "Tgid" label, the second one holds the value */
		tok = strtok_r(buf, ":", &save);
		if (!tok)
			continue;
		tok = strtok_r(NULL, ":", &save);
		if (!tok)
			continue;
		/* <ctype.h> functions require a value representable as unsigned char */
		while (isspace((unsigned char)*tok))
			tok++;
		tgid = strtol(tok, NULL, 0);
		break;
	}
	free(buf);
	/* This runs once per discovered CID, so the stream must not be leaked */
	fclose(fp);

	if (tgid >= 0)
		printf("%d\t%d\n", cid, tgid);
}

/*
 * Probe a single @cid: trace a connection attempt to it and, if the traced
 * wake up chain reaches a task that did a kvm_exit, print the CID together
 * with that task's thread group ID.
 */
static void find_cid(struct trace_info *info, int cid)
{
	int fd;

	/* The chain of wake ups starts with this process */
	add_pid(&info->pids, this_pid);

	/* Trace only while the connection attempt is in flight */
	tracefs_instance_file_clear(info->instance, "trace");
	tracefs_trace_on(info->instance);
	/* -1 converts to the largest value of the unsigned port parameter */
	fd = open_vsock(cid, -1);
	tracefs_trace_off(info->instance);
	if (fd >= 0)
		close(fd);
	info->cid = cid;
	/* callback() overwrites this when it sees a kvm_exit from a followed task */
	info->pid = -1;
	tracefs_iterate_raw_events(info->tep, info->instance,
				   NULL, 0, callback, info);
	if (info->pid >= 0)
		print_cid_pid(cid, info->pid);
	tracefs_trace_off(info->instance);
	/* Start the next probe with an empty list of followed pids */
	free_pids(info->pids);
	info->pids = NULL;
}

/*
 * Set up tracing, probe every CID from 0 up to (but not including) MAX_CID,
 * then tear the tracing down again.
 *
 * Returns 0 on success, -1 if the trace setup failed.
 */
static int find_cids(void)
{
	struct trace_info info;
	int cid = 0;

	if (setup_trace(&info) < 0)
		return -1;

	while (cid < MAX_CID)
		find_cid(&info, cid++);

	tear_down_trace(&info);
	return 0;
}

/*
 * Record our own pid (the start of every traced wake up chain) and scan
 * all CIDs. Exits with a failure status if the scan could not be set up.
 */
int main(int argc, char *argv[])
{
	/* No command line options are supported */
	(void)argc;
	(void)argv;

	this_pid = getpid();

	/* Report a failed trace setup instead of claiming success */
	if (find_cids() < 0)
		exit(EXIT_FAILURE);

	exit(EXIT_SUCCESS);
}
Stefano Garzarella May 7, 2021, 2:11 p.m. UTC | #2
Hi Steven,

On Wed, May 05, 2021 at 04:38:55PM -0400, Steven Rostedt wrote:
>The new trace-cmd 3.0 (which is almost ready to be released) allows for

>tracing between host and guests with timestamp synchronization such that

>the events on the host and the guest can be interleaved in the proper order

>that they occur. KernelShark now has a plugin that visualizes this

>interaction.

>

>The implementation requires that the guest has a vsock CID assigned, and on

>the guest a "trace-cmd agent" is running, that will listen on a port for

>the CID. The on the host a "trace-cmd record -A guest@cid:port -e events"

>can be called and the host will connect to the guest agent through the

>cid/port pair and have the agent enable tracing on behalf of the host and

>send the trace data back down to it.

>

>The problem is that there is no sure fire way to find the CID for a guest.

>Currently, the user must know the cid, or we have a hack that looks for the

>qemu process and parses the --guest-cid parameter from it. But this is

>prone to error and does not work on other implementation (was told that

>crosvm does not use qemu).


For debugging, I think it could be useful to link the vhost-vsock kthread to the 
CID, but from the user's point of view, maybe it is better to query the VM 
management layer; for example, if you're using libvirt, you can easily do:

$ virsh dumpxml fedora34 | grep cid
     <cid auto='yes' address='3'/>

>

>As I can not find a way to discover CIDs assigned to guests via any kernel

>interface, I decided to create this one. Note, I'm not attached to it. If

>there's a better way to do this, I would love to have it. But since I'm not

>an expert in the networking layer nor virtio, I decided to stick to what I

>know and add a debugfs interface that simply lists all the registered 

>CIDs

>and the worker task that they are associated with. The worker task at

>least has the PID of the task it represents.


I honestly don't know if it's the best interface, like I said maybe for 
debugging it's fine, but if we want to expose it to the user in some 
way, we could support devlink/netlink to provide information about the 
vsock devices currently in use.

>

>Now I can find the cid / host process in charge of the guest pair:

>

>  # cat /sys/kernel/debug/vsock_list

>  3	vhost-1954:2002

>

>  # ps aux | grep 1954

>  qemu        1954  9.9 21.3 1629092 796148 ?      Sl   16:22   0:58  /usr/bin/qemu-kvm -name guest=Fedora21,debug-threads=on -S -object secret,id=masterKey0,format=raw,file=/var/lib/libvirt/qemu/domain-1-Fedora21/master-key.aes -machine pc-1.2,accel=kvm,usb=off,dump-guest-core=off -cpu qemu64 -m 1000 -overcommit mem-lock=off -smp 2,sockets=2,cores=1,threads=1 -uuid 1eefeeb0-3ac7-07c1-926e-236908313b4c -no-user-config -nodefaults -chardev socket,id=charmonitor,fd=32,server,nowait -mon chardev=charmonitor,id=monitor,mode=control -rtc base=utc -no-shutdown -boot strict=on -device piix3-usb-uhci,id=usb,bus=pci.0,addr=0x1.0x2 -device virtio-serial-pci,id=virtio-serial0,bus=pci.0,addr=0x6 -blockdev {"driver":"host_device","filename":"/dev/mapper/vg_bxtest-GuestFedora","node-name":"libvirt-1-storage","auto-read-only":true,"discard":"unmap"} -blockdev {"node-name":"libvirt-1-format","read-only":false,"driver":"raw","file":"libvirt-1-storage"} -device ide-hd,bus=ide.0,unit=0,drive=libvirt-1-

> format,id=ide0-0-0,bootindex=1 -netdev tap,fd=34,id=hostnet0 -device rtl8139,netdev=hostnet0,id=net0,mac=52:54:00:9f:e9:d5,bus=pci.0,addr=0x3 -netdev tap,fd=35,id=hostnet1 -device virtio-net-pci,netdev=hostnet1,id=net1,mac=52:54:00:ec:dc:6e,bus=pci.0,addr=0x5 -chardev pty,id=charserial0 -device isa-serial,chardev=charserial0,id=serial0 -chardev pipe,id=charchannel0,path=/var/lib/trace-cmd/virt/Fedora21/trace-pipe-cpu0 -device virtserialport,bus=virtio-serial0.0,nr=1,chardev=charchannel0,id=channel0,name=trace-pipe-cpu0 -chardev pipe,id=charchannel1,path=/var/lib/trace-cmd/virt/Fedora21/trace-pipe-cpu1 -device virtserialport,bus=virtio-serial0.0,nr=2,chardev=charchannel1,id=channel1,name=trace-pipe-cpu1 -vnc 127.0.0.1:0 -device cirrus-vga,id=video0,bus=pci.0,addr=0x2 -device virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0x4 -sandbox on,obsolete=deny,elevateprivileges=deny,spawn=deny,resourcecontrol=deny -device vhost-vsock-pci,id=vsock0,guest-cid=3,vhostfd=16,bus=pci.0,addr=0x7 -msg

> timestamp=on

>  root        2000  0.0  0.0      0     0 ?        S    16:22   0:00 [kvm-pit/1954]

>  root        2002  0.0  0.0      0     0 ?        S    16:22   0:00 [vhost-1954]

>

>

>This is just an example of what I'm looking for. Just a way to find what

>process is using what cid.

>

>Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>

>---

>diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c

>index 5e78fb719602..4f03b25b23c1 100644

>--- a/drivers/vhost/vsock.c

>+++ b/drivers/vhost/vsock.c

>@@ -15,6 +15,7 @@

> #include <linux/virtio_vsock.h>

> #include <linux/vhost.h>

> #include <linux/hashtable.h>

>+#include <linux/debugfs.h>

>

> #include <net/af_vsock.h>

> #include "vhost.h"

>@@ -900,6 +901,128 @@ static struct miscdevice vhost_vsock_misc = {

> 	.fops = &vhost_vsock_fops,

> };

>

>+static struct dentry *vsock_file;

>+

>+struct vsock_file_iter {

>+	struct hlist_node	*node;

>+	int			index;

>+};

>+

>+

>+static void *vsock_next(struct seq_file *m, void *v, loff_t *pos)

>+{

>+	struct vsock_file_iter *iter = v;

>+	struct vhost_vsock *vsock;

>+

>+	if (pos)

>+		(*pos)++;

>+

>+	if (iter->index >= (int)HASH_SIZE(vhost_vsock_hash))

>+		return NULL;

>+

>+	if (iter->node)

>+		iter->node = rcu_dereference_raw(hlist_next_rcu(iter->node));

>+

>+	for (;;) {

>+		if (iter->node) {

>+			vsock = hlist_entry_safe(rcu_dereference_raw(iter->node),

>+						 struct vhost_vsock, hash);

>+			if (vsock->guest_cid)

>+				break;

>+			iter->node = rcu_dereference_raw(hlist_next_rcu(iter->node));

>+			continue;

>+		}

>+		iter->index++;

>+		if (iter->index >= HASH_SIZE(vhost_vsock_hash))

>+			return NULL;

>+

>+		iter->node = rcu_dereference_raw(hlist_first_rcu(&vhost_vsock_hash[iter->index]));

>+	}

>+	return iter;

>+}

>+

>+static void *vsock_start(struct seq_file *m, loff_t *pos)

>+{

>+	struct vsock_file_iter *iter = m->private;

>+	loff_t l = 0;

>+	void *t;

>+

>+	rcu_read_lock();


Instead of keeping this rcu lock between vsock_start() and vsock_stop(), 
maybe it's better to make a dump here of the bindings (pid/cid), save it 
in an array, and iterate it in vsock_next().

>+

>+	iter->index = -1;

>+	iter->node = NULL;

>+	t = vsock_next(m, iter, NULL);

>+

>+	for (; iter->index < HASH_SIZE(vhost_vsock_hash) && l < *pos;

>+	     t = vsock_next(m, iter, &l))

>+		;


A while() maybe was more readable...

Thanks,
Stefano
Steven Rostedt May 7, 2021, 2:40 p.m. UTC | #3
On Fri, 7 May 2021 16:11:20 +0200
Stefano Garzarella <sgarzare@redhat.com> wrote:

> Hi Steven,

> 

> On Wed, May 05, 2021 at 04:38:55PM -0400, Steven Rostedt wrote:

> >The new trace-cmd 3.0 (which is almost ready to be released) allows for

> >tracing between host and guests with timestamp synchronization such that

> >the events on the host and the guest can be interleaved in the proper order

> >that they occur. KernelShark now has a plugin that visualizes this

> >interaction.

> >

> >The implementation requires that the guest has a vsock CID assigned, and on

> >the guest a "trace-cmd agent" is running, that will listen on a port for

> >the CID. The on the host a "trace-cmd record -A guest@cid:port -e events"

> >can be called and the host will connect to the guest agent through the

> >cid/port pair and have the agent enable tracing on behalf of the host and

> >send the trace data back down to it.

> >

> >The problem is that there is no sure fire way to find the CID for a guest.

> >Currently, the user must know the cid, or we have a hack that looks for the

> >qemu process and parses the --guest-cid parameter from it. But this is

> >prone to error and does not work on other implementation (was told that

> >crosvm does not use qemu).  

> 

> For debug I think could be useful to link the vhost-vsock kthread to the 

> CID, but for the user point of view, maybe is better to query the VM 

> management layer, for example if you're using libvirt, you can easily do:

> 

> $ virsh dumpxml fedora34 | grep cid

>      <cid auto='yes' address='3'/>


We looked into going this route, but then that means trace-cmd host/guest
tracing needs a way to handle every layer, as some people use libvirt
(myself included), some people use straight qemu, some people use Xen, and
some people use crosvm. We need to support all of them. Which is why I'm
looking at doing this from the lowest common denominator, and since vsock
is a requirement from trace-cmd to do this tracing, getting the thread
that's related to the vsock is that lowest denominator.

> 

> >

> >As I can not find a way to discover CIDs assigned to guests via any kernel

> >interface, I decided to create this one. Note, I'm not attached to it. If

> >there's a better way to do this, I would love to have it. But since I'm not

> >an expert in the networking layer nor virtio, I decided to stick to what I

> >know and add a debugfs interface that simply lists all the registered 

> >CIDs

> >and the worker task that they are associated with. The worker task at

> >least has the PID of the task it represents.  

> 

> I honestly don't know if it's the best interface, like I said maybe for 

> debugging it's fine, but if we want to expose it to the user in some 

> way, we could support devlink/netlink to provide information about the 

> vsock devices currently in use.


Ideally, a devlink/netlink is the right approach. I just had no idea on how
to implement that ;-)  So I went with what I know, which is debugfs files!



> >Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>

> >---

> >diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c

> >index 5e78fb719602..4f03b25b23c1 100644

> >--- a/drivers/vhost/vsock.c

> >+++ b/drivers/vhost/vsock.c

> >@@ -15,6 +15,7 @@

> > #include <linux/virtio_vsock.h>

> > #include <linux/vhost.h>

> > #include <linux/hashtable.h>

> >+#include <linux/debugfs.h>

> >

> > #include <net/af_vsock.h>

> > #include "vhost.h"

> >@@ -900,6 +901,128 @@ static struct miscdevice vhost_vsock_misc = {

> > 	.fops = &vhost_vsock_fops,

> > };

> >

> >+static struct dentry *vsock_file;

> >+

> >+struct vsock_file_iter {

> >+	struct hlist_node	*node;

> >+	int			index;

> >+};

> >+

> >+

> >+static void *vsock_next(struct seq_file *m, void *v, loff_t *pos)

> >+{

> >+	struct vsock_file_iter *iter = v;

> >+	struct vhost_vsock *vsock;

> >+

> >+	if (pos)

> >+		(*pos)++;

> >+

> >+	if (iter->index >= (int)HASH_SIZE(vhost_vsock_hash))

> >+		return NULL;

> >+

> >+	if (iter->node)

> >+		iter->node = rcu_dereference_raw(hlist_next_rcu(iter->node));

> >+

> >+	for (;;) {

> >+		if (iter->node) {

> >+			vsock = hlist_entry_safe(rcu_dereference_raw(iter->node),

> >+						 struct vhost_vsock, hash);

> >+			if (vsock->guest_cid)

> >+				break;

> >+			iter->node = rcu_dereference_raw(hlist_next_rcu(iter->node));

> >+			continue;

> >+		}

> >+		iter->index++;

> >+		if (iter->index >= HASH_SIZE(vhost_vsock_hash))

> >+			return NULL;

> >+

> >+		iter->node = rcu_dereference_raw(hlist_first_rcu(&vhost_vsock_hash[iter->index]));

> >+	}

> >+	return iter;

> >+}

> >+

> >+static void *vsock_start(struct seq_file *m, loff_t *pos)

> >+{

> >+	struct vsock_file_iter *iter = m->private;

> >+	loff_t l = 0;

> >+	void *t;

> >+

> >+	rcu_read_lock();  

> 

> Instead of keeping this rcu lock between vsock_start() and vsock_stop(), 

> maybe it's better to make a dump here of the bindings (pid/cid), save it 

> in an array, and iterate it in vsock_next().


The start/stop of a seq_file() is made for taking locks. I do this with all
my code in ftrace. Yeah, there's a while loop between the two, but that's
just to fill the buffer. It's not that long and it never goes to userspace
between the two. You can even use this for spin locks (but I wouldn't
recommend doing it for raw ones).

> 

> >+

> >+	iter->index = -1;

> >+	iter->node = NULL;

> >+	t = vsock_next(m, iter, NULL);

> >+

> >+	for (; iter->index < HASH_SIZE(vhost_vsock_hash) && l < *pos;

> >+	     t = vsock_next(m, iter, &l))

> >+		;  

> 

> A while() maybe was more readable...


Again, I just cut and pasted from my other code.

If you have a good idea on how to implement this with netlink (something
that ss or netstat can display), I think that's the best way to go.

Thanks for looking at this!

-- Steve
Stefano Garzarella May 7, 2021, 3:43 p.m. UTC | #4
On Fri, May 07, 2021 at 10:40:36AM -0400, Steven Rostedt wrote:
>On Fri, 7 May 2021 16:11:20 +0200

>Stefano Garzarella <sgarzare@redhat.com> wrote:

>

>> Hi Steven,

>>

>> On Wed, May 05, 2021 at 04:38:55PM -0400, Steven Rostedt wrote:

>> >The new trace-cmd 3.0 (which is almost ready to be released) allows for

>> >tracing between host and guests with timestamp synchronization such that

>> >the events on the host and the guest can be interleaved in the proper order

>> >that they occur. KernelShark now has a plugin that visualizes this

>> >interaction.

>> >

>> >The implementation requires that the guest has a vsock CID assigned, and on

>> >the guest a "trace-cmd agent" is running, that will listen on a port for

>> >the CID. The on the host a "trace-cmd record -A guest@cid:port -e events"

>> >can be called and the host will connect to the guest agent through the

>> >cid/port pair and have the agent enable tracing on behalf of the host and

>> >send the trace data back down to it.

>> >

>> >The problem is that there is no sure fire way to find the CID for a guest.

>> >Currently, the user must know the cid, or we have a hack that looks for the

>> >qemu process and parses the --guest-cid parameter from it. But this is

>> >prone to error and does not work on other implementation (was told that

>> >crosvm does not use qemu).

>>

>> For debug I think could be useful to link the vhost-vsock kthread to the

>> CID, but for the user point of view, maybe is better to query the VM

>> management layer, for example if you're using libvirt, you can easily do:

>>

>> $ virsh dumpxml fedora34 | grep cid

>>      <cid auto='yes' address='3'/>

>

>We looked into going this route, but then that means trace-cmd host/guest

>tracing needs a way to handle every layer, as some people use libvirt

>(myself included), some people use straight qemu, some people us Xen, and

>some people use crosvm. We need to support all of them. Which is why I'm

>looking at doing this from the lowest common denominator, and since vsock

>is a requirement from trace-cmd to do this tracing, getting the thread

>that's related to the vsock is that lowest denominator.


Makes sense.
Just a note, there are some VMMs, like Firecracker, Cloud Hypervisor, or 
QEMU with vhost-user-vsock, that don't use vhost-vsock in the host, but 
they implement a hybrid vsock over Unix Domain Socket:
https://github.com/firecracker-microvm/firecracker/blob/main/docs/vsock.md

So in that case this approach or netlink/devlink, would not work, but 
the application in the host can't use a vsock socket, so maybe isn't a 
problem.

>

>>

>> >

>> >As I can not find a way to discover CIDs assigned to guests via any kernel

>> >interface, I decided to create this one. Note, I'm not attached to it. If

>> >there's a better way to do this, I would love to have it. But since I'm not

>> >an expert in the networking layer nor virtio, I decided to stick to what I

>> >know and add a debugfs interface that simply lists all the 

>> >registered

>> >CIDs

>> >and the worker task that they are associated with. The worker task at

>> >least has the PID of the task it represents.

>>

>> I honestly don't know if it's the best interface, like I said maybe for

>> debugging it's fine, but if we want to expose it to the user in some

>> way, we could support devlink/netlink to provide information about the

>> vsock devices currently in use.

>

>Ideally, a devlink/netlink is the right approach. I just had no idea on how

>to implement that ;-)  So I went with what I know, which is debugfs files!

>

>

>

>> >Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>

>> >---

>> >diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c

>> >index 5e78fb719602..4f03b25b23c1 100644

>> >--- a/drivers/vhost/vsock.c

>> >+++ b/drivers/vhost/vsock.c

>> >@@ -15,6 +15,7 @@

>> > #include <linux/virtio_vsock.h>

>> > #include <linux/vhost.h>

>> > #include <linux/hashtable.h>

>> >+#include <linux/debugfs.h>

>> >

>> > #include <net/af_vsock.h>

>> > #include "vhost.h"

>> >@@ -900,6 +901,128 @@ static struct miscdevice vhost_vsock_misc = {

>> > 	.fops = &vhost_vsock_fops,

>> > };

>> >

>> >+static struct dentry *vsock_file;

>> >+

>> >+struct vsock_file_iter {

>> >+	struct hlist_node	*node;

>> >+	int			index;

>> >+};

>> >+

>> >+

>> >+static void *vsock_next(struct seq_file *m, void *v, loff_t *pos)

>> >+{

>> >+	struct vsock_file_iter *iter = v;

>> >+	struct vhost_vsock *vsock;

>> >+

>> >+	if (pos)

>> >+		(*pos)++;

>> >+

>> >+	if (iter->index >= (int)HASH_SIZE(vhost_vsock_hash))

>> >+		return NULL;

>> >+

>> >+	if (iter->node)

>> >+		iter->node = rcu_dereference_raw(hlist_next_rcu(iter->node));

>> >+

>> >+	for (;;) {

>> >+		if (iter->node) {

>> >+			vsock = hlist_entry_safe(rcu_dereference_raw(iter->node),

>> >+						 struct vhost_vsock, hash);

>> >+			if (vsock->guest_cid)

>> >+				break;

>> >+			iter->node = 

>> >rcu_dereference_raw(hlist_next_rcu(iter->node));

>> >+			continue;

>> >+		}

>> >+		iter->index++;

>> >+		if (iter->index >= HASH_SIZE(vhost_vsock_hash))

>> >+			return NULL;

>> >+

>> >+		iter->node = rcu_dereference_raw(hlist_first_rcu(&vhost_vsock_hash[iter->index]));

>> >+	}

>> >+	return iter;

>> >+}

>> >+

>> >+static void *vsock_start(struct seq_file *m, loff_t *pos)

>> >+{

>> >+	struct vsock_file_iter *iter = m->private;

>> >+	loff_t l = 0;

>> >+	void *t;

>> >+

>> >+	rcu_read_lock();

>>

>> Instead of keeping this rcu lock between vsock_start() and vsock_stop(),

>> maybe it's better to make a dump here of the bindings (pid/cid), save it

>> in an array, and iterate it in vsock_next().

>

>The start/stop of a seq_file() is made for taking locks. I do this with all

>my code in ftrace. Yeah, there's a while loop between the two, but that's

>just to fill the buffer. It's not that long and it never goes to userspace

>between the two. You can even use this for spin locks (but I wouldn't

>recommend doing it for raw ones).


Ah okay, thanks for the clarification!

I was worried because building with `make C=2` I had these warnings:

../drivers/vhost/vsock.c:944:13: warning: context imbalance in 'vsock_start' - wrong count at exit
../drivers/vhost/vsock.c:963:13: warning: context imbalance in 'vsock_stop' - unexpected unlock

Maybe we need to annotate the functions somehow.

>

>>

>> >+

>> >+	iter->index = -1;

>> >+	iter->node = NULL;

>> >+	t = vsock_next(m, iter, NULL);

>> >+

>> >+	for (; iter->index < HASH_SIZE(vhost_vsock_hash) && l < *pos;

>> >+	     t = vsock_next(m, iter, &l))

>> >+		;

>>

>> A while() maybe was more readable...

>

>Again, I just cut and pasted from my other code.

>

>If you have a good idea on how to implement this with netlink (something

>that ss or netstat can dislpay), I think that's the best way to go.


Okay, I'll take a look and get back to you.
If it's too complicated, we can go ahead with this patch.

Thanks,
Stefano
Steven Rostedt May 7, 2021, 4:09 p.m. UTC | #5
On Fri, 7 May 2021 17:43:32 +0200
Stefano Garzarella <sgarzare@redhat.com> wrote:

> >The start/stop of a seq_file() is made for taking locks. I do this with all

> >my code in ftrace. Yeah, there's a while loop between the two, but that's

> >just to fill the buffer. It's not that long and it never goes to userspace

> >between the two. You can even use this for spin locks (but I wouldn't

> >recommend doing it for raw ones).  

> 

> Ah okay, thanks for the clarification!

> 

> I was worried because building with `make C=2` I had these warnings:

> 

> ../drivers/vhost/vsock.c:944:13: warning: context imbalance in 'vsock_start' - wrong count at exit

> ../drivers/vhost/vsock.c:963:13: warning: context imbalance in 'vsock_stop' - unexpected unlock

> 

> Maybe we need to annotate the functions somehow.


Yep, it should have been.

static void *vsock_start(struct seq_file *m, loff_t *pos)
	__acquires(rcu)
{
	[...]

}

static void vsock_stop(struct seq_file *m, void *p)
	__releases(rcu)
{
	[...]
}

static int vsock_show(struct seq_file *m, void *v)
	__must_hold(rcu)
{
	[...]
}


And guess what? I just copied those annotations from sock_hash_seq_start(),
sock_hash_seq_show() and sock_hash_seq_stop() from net/core/sock_map.c
which is doing exactly the same thing ;-)

So there's definitely precedent for this.

> 

> >  

> >>  

> >> >+

> >> >+	iter->index = -1;

> >> >+	iter->node = NULL;

> >> >+	t = vsock_next(m, iter, NULL);

> >> >+

> >> >+	for (; iter->index < HASH_SIZE(vhost_vsock_hash) && l < *pos;

> >> >+	     t = vsock_next(m, iter, &l))

> >> >+		;  

> >>

> >> A while() maybe was more readable...  

> >

> >Again, I just cut and pasted from my other code.

> >

> >If you have a good idea on how to implement this with netlink (something

> >that ss or netstat can dislpay), I think that's the best way to go.  

> 

> Okay, I'll take a look and get back to you.

> If it's too complicated, we can go ahead with this patch.


Awesome, thanks!

-- Steve
Mike Christie May 8, 2021, 6:32 p.m. UTC | #6
On 5/5/21 3:38 PM, Steven Rostedt wrote:
> The new trace-cmd 3.0 (which is almost ready to be released) allows for

> tracing between host and guests with timestamp synchronization such that

> the events on the host and the guest can be interleaved in the proper order

> that they occur. KernelShark now has a plugin that visualizes this

> interaction.

> 

> The implementation requires that the guest has a vsock CID assigned, and on

> the guest a "trace-cmd agent" is running, that will listen on a port for

> the CID. The on the host a "trace-cmd record -A guest@cid:port -e events"

> can be called and the host will connect to the guest agent through the

> cid/port pair and have the agent enable tracing on behalf of the host and

> send the trace data back down to it.

> 

> The problem is that there is no sure fire way to find the CID for a guest.

> Currently, the user must know the cid, or we have a hack that looks for the

> qemu process and parses the --guest-cid parameter from it. But this is

> prone to error and does not work on other implementation (was told that

> crosvm does not use qemu).

> 

> As I can not find a way to discover CIDs assigned to guests via any kernel

> interface, I decided to create this one. Note, I'm not attached to it. If

> there's a better way to do this, I would love to have it. But since I'm not

> an expert in the networking layer nor virtio, I decided to stick to what I

> know and add a debugfs interface that simply lists all the registered CIDs

> and the worker task that they are associated with. The worker task at

> least has the PID of the task it represents.

> 

> Now I can find the cid / host process in charge of the guest pair:

> 

>   # cat /sys/kernel/debug/vsock_list

>   3	vhost-1954:2002

> 


I think I need the same thing for vhost-scsi. We want to know a vhost-scsi
dev's worker thread's pid. If we use multiple vhost-devs in one VM then we
want to be able to know which thread goes with which dev.

For the vhost thread patches I added an ioctl:

https://lists.linuxfoundation.org/pipermail/virtualization/2021-April/054014.html

but I had originally implemented it in sysfs. For sysfs we can add a struct
device in the vhost_dev and a struct device in the vhost_virtqueue. We then
have 2 new classes, /sys/class/vhost_device and vhost_virtqueue, with the
vhost_device device the parent of the vhost_virtqueue device.

The nice thing is that it's a common interface and works for every vhost_dev
and all their virtqueues. It works for non libvirt users.

The drawback is adding in refcounts/releases and that type of code for the
vhost_dev and vhost_virtqueue. Also I'm not sure about security.

Note that I'm not tied to sysfs. netlink would be fine. I just need any
interface.
Stefan Hajnoczi May 13, 2021, 3:57 p.m. UTC | #7
On Wed, May 05, 2021 at 04:38:55PM -0400, Steven Rostedt wrote:
> The new trace-cmd 3.0 (which is almost ready to be released) allows for

> tracing between host and guests with timestamp synchronization such that

> the events on the host and the guest can be interleaved in the proper order

> that they occur. KernelShark now has a plugin that visualizes this

> interaction.

> 

> The implementation requires that the guest has a vsock CID assigned, and on

> the guest a "trace-cmd agent" is running, that will listen on a port for

> the CID. The on the host a "trace-cmd record -A guest@cid:port -e events"

> can be called and the host will connect to the guest agent through the

> cid/port pair and have the agent enable tracing on behalf of the host and

> send the trace data back down to it.

> 

> The problem is that there is no sure fire way to find the CID for a guest.

> Currently, the user must know the cid, or we have a hack that looks for the

> qemu process and parses the --guest-cid parameter from it. But this is

> prone to error and does not work on other implementation (was told that

> crosvm does not use qemu).


The crosvm command-line syntax is: crosvm run --cid <CID>

> As I can not find a way to discover CIDs assigned to guests via any kernel

> interface, I decided to create this one. Note, I'm not attached to it. If

> there's a better way to do this, I would love to have it. But since I'm not

> an expert in the networking layer nor virtio, I decided to stick to what I

> know and add a debugfs interface that simply lists all the registered CIDs

> and the worker task that they are associated with. The worker task at

> least has the PID of the task it represents.

> 

> Now I can find the cid / host process in charge of the guest pair:

> 

>   # cat /sys/kernel/debug/vsock_list

>   3	vhost-1954:2002

> 

>   # ps aux | grep 1954

>   qemu        1954  9.9 21.3 1629092 796148 ?      Sl   16:22   0:58  /usr/bin/qemu-kvm -name guest=Fedora21,debug-threads=on -S -object secret,id=masterKey0,format=raw,file=/var/lib/libvirt/qemu/domain-1-Fedora21/master-key.aes -machine pc-1.2,accel=kvm,usb=off,dump-guest-core=off -cpu qemu64 -m 1000 -overcommit mem-lock=off -smp 2,sockets=2,cores=1,threads=1 -uuid 1eefeeb0-3ac7-07c1-926e-236908313b4c -no-user-config -nodefaults -chardev socket,id=charmonitor,fd=32,server,nowait -mon chardev=charmonitor,id=monitor,mode=control -rtc base=utc -no-shutdown -boot strict=on -device piix3-usb-uhci,id=usb,bus=pci.0,addr=0x1.0x2 -device virtio-serial-pci,id=virtio-serial0,bus=pci.0,addr=0x6 -blockdev {"driver":"host_device","filename":"/dev/mapper/vg_bxtest-GuestFedora","node-name":"libvirt-1-storage","auto-read-only":true,"discard":"unmap"} -blockdev {"node-name":"libvirt-1-format","read-only":false,"driver":"raw","file":"libvirt-1-storage"} -device ide-hd,bus=ide.0,unit=0,drive=libvirt-1-

>  format,id=ide0-0-0,bootindex=1 -netdev tap,fd=34,id=hostnet0 -device rtl8139,netdev=hostnet0,id=net0,mac=52:54:00:9f:e9:d5,bus=pci.0,addr=0x3 -netdev tap,fd=35,id=hostnet1 -device virtio-net-pci,netdev=hostnet1,id=net1,mac=52:54:00:ec:dc:6e,bus=pci.0,addr=0x5 -chardev pty,id=charserial0 -device isa-serial,chardev=charserial0,id=serial0 -chardev pipe,id=charchannel0,path=/var/lib/trace-cmd/virt/Fedora21/trace-pipe-cpu0 -device virtserialport,bus=virtio-serial0.0,nr=1,chardev=charchannel0,id=channel0,name=trace-pipe-cpu0 -chardev pipe,id=charchannel1,path=/var/lib/trace-cmd/virt/Fedora21/trace-pipe-cpu1 -device virtserialport,bus=virtio-serial0.0,nr=2,chardev=charchannel1,id=channel1,name=trace-pipe-cpu1 -vnc 127.0.0.1:0 -device cirrus-vga,id=video0,bus=pci.0,addr=0x2 -device virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0x4 -sandbox on,obsolete=deny,elevateprivileges=deny,spawn=deny,resourcecontrol=deny -device vhost-vsock-pci,id=vsock0,guest-cid=3,vhostfd=16,bus=pci.0,addr=0x7 -msg 

>  timestamp=on

>   root        2000  0.0  0.0      0     0 ?        S    16:22   0:00 [kvm-pit/1954]

>   root        2002  0.0  0.0      0     0 ?        S    16:22   0:00 [vhost-1954]


This approach relies on process hierarchy of the VMM (QEMU).
Multi-process QEMU is in development and will allow VIRTIO devices to
run as separate processes from the main QEMU. It then becomes harder to
correlate a VIRTIO device process with its QEMU process.

So I think in the end this approach ends up being as fragile as parsing
command-lines. The kernel doesn't really have the concept of a "VM" that
the vhost_vsock is associated with :). Maybe just parse QEMU and crosvm
command-lines?

Stefan
Steven Rostedt May 13, 2021, 4:08 p.m. UTC | #8
On Thu, 13 May 2021 16:57:34 +0100
Stefan Hajnoczi <stefanha@redhat.com> wrote:


> This approach relies on process hierarchy of the VMM (QEMU).

> Multi-process QEMU is in development and will allow VIRTIO devices to

> run as separate processes from the main QEMU. It then becomes harder to

> correlate a VIRTIO device process with its QEMU process.


And we need to know all these mappings regardless, as we need to map each
thread / process to the vCPU in order to correlate between host thread and
vCPU thread for showing in KernelShark.

Thus this mapping to find the main thread/process needs to be done
regardless.

> 

> So I think in the end this approach ends up being as fragile as parsing

> command-lines. The kernel doesn't really have the concept of a "VM" that

> the vhost_vsock is associated with :). Maybe just parse QEMU and crosvm

> command-lines?

>


That's what we do now, and it already broke once, and even parsing the
command line won't be enough for the reasons stated above.

-- Steve
diff mbox series

Patch

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index 5e78fb719602..4f03b25b23c1 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -15,6 +15,7 @@ 
 #include <linux/virtio_vsock.h>
 #include <linux/vhost.h>
 #include <linux/hashtable.h>
+#include <linux/debugfs.h>
 
 #include <net/af_vsock.h>
 #include "vhost.h"
@@ -900,6 +901,128 @@  static struct miscdevice vhost_vsock_misc = {
 	.fops = &vhost_vsock_fops,
 };
 
+static struct dentry *vsock_file;
+
+struct vsock_file_iter {
+	struct hlist_node	*node;
+	int			index;
+};
+
+
+static void *vsock_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct vsock_file_iter *iter = v;
+	struct vhost_vsock *vsock;
+
+	if (pos)
+		(*pos)++;
+
+	if (iter->index >= (int)HASH_SIZE(vhost_vsock_hash))
+		return NULL;
+
+	if (iter->node)
+		iter->node = rcu_dereference_raw(hlist_next_rcu(iter->node));
+
+	for (;;) {
+		if (iter->node) {
+			vsock = hlist_entry_safe(rcu_dereference_raw(iter->node),
+						 struct vhost_vsock, hash);
+			if (vsock->guest_cid)
+				break;
+			iter->node = rcu_dereference_raw(hlist_next_rcu(iter->node));
+			continue;
+		}
+		iter->index++;
+		if (iter->index >= HASH_SIZE(vhost_vsock_hash))
+			return NULL;
+
+		iter->node = rcu_dereference_raw(hlist_first_rcu(&vhost_vsock_hash[iter->index]));
+	}
+	return iter;
+}
+
+static void *vsock_start(struct seq_file *m, loff_t *pos)
+{
+	struct vsock_file_iter *iter = m->private;
+	loff_t l = 0;
+	void *t;
+
+	rcu_read_lock();
+
+	iter->index = -1;
+	iter->node = NULL;
+	t = vsock_next(m, iter, NULL);
+
+	for (; iter->index < HASH_SIZE(vhost_vsock_hash) && l < *pos;
+	     t = vsock_next(m, iter, &l))
+		;
+
+	return t;
+}
+
+static void vsock_stop(struct seq_file *m, void *p)
+{
+	rcu_read_unlock();
+}
+
+static int vsock_show(struct seq_file *m, void *v)
+{
+	struct vsock_file_iter *iter = v;
+	struct vhost_vsock *vsock;
+	struct task_struct *worker;
+
+	if (!iter || iter->index >= HASH_SIZE(vhost_vsock_hash))
+		return 0;
+
+	vsock = hlist_entry_safe(rcu_dereference_raw(iter->node), struct vhost_vsock, hash);
+	worker = vsock->dev.worker;
+	seq_printf(m, "%d\t", vsock->guest_cid);
+
+	if (worker)
+		seq_printf(m, "%s:%d\n", worker->comm, worker->pid);
+	else
+		seq_puts(m, "(no task)\n");
+
+	return 0;
+}
+
+static const struct seq_operations vsock_file_seq_ops = {
+	.start		= vsock_start,
+	.next		= vsock_next,
+	.stop		= vsock_stop,
+	.show		= vsock_show,
+};
+
+static int vsock_file_open(struct inode *inode, struct file *file)
+{
+	struct vsock_file_iter *iter;
+	struct seq_file *m;
+	int ret;
+
+	iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+	if (!iter)
+		return -ENOMEM;
+
+	ret = seq_open(file, &vsock_file_seq_ops);
+	if (ret) {
+		kfree(iter);
+		return ret;
+	}
+
+	m = file->private_data;
+	m->private = iter;
+
+	return 0;
+}
+
+static const struct file_operations vsock_file_fops = {
+	.owner		= THIS_MODULE,
+	.open		= vsock_file_open,
+	.release	= seq_release_private,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+};
+
 static int __init vhost_vsock_init(void)
 {
 	int ret;
@@ -908,12 +1031,15 @@  static int __init vhost_vsock_init(void)
 				  VSOCK_TRANSPORT_F_H2G);
 	if (ret < 0)
 		return ret;
+	vsock_file = debugfs_create_file("vsock_list", 0400,
+					 NULL, NULL, &vsock_file_fops);
 	return misc_register(&vhost_vsock_misc);
 };
 
 static void __exit vhost_vsock_exit(void)
 {
 	misc_deregister(&vhost_vsock_misc);
+	debugfs_remove(vsock_file);
 	vsock_core_unregister(&vhost_transport.transport);
 };