diff mbox series

[v8,17/20] multi-process: heartbeat messages to remote

Message ID 93b7566e5d565b9e5d8127849bb5be65057e25cc.1596217462.git.jag.raman@oracle.com
State New
Headers show
Series None | expand

Commit Message

Jag Raman July 31, 2020, 6:20 p.m. UTC
From: Elena Ufimtseva <elena.ufimtseva@oracle.com>

In order to detect remote processes which are hung, the
proxy periodically sends heartbeat messages to confirm if
the remote process is alive. The remote process responds
to this heartbeat message to confirm it is alive.

Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
---
 hw/i386/remote-msg.c     | 19 ++++++++++++++++++
 hw/pci/proxy.c           | 52 ++++++++++++++++++++++++++++++++++++++++++++++++
 include/hw/pci/proxy.h   |  2 ++
 include/io/mpqemu-link.h |  1 +
 io/mpqemu-link.c         |  8 ++++++++
 5 files changed, 82 insertions(+)
diff mbox series

Patch

diff --git a/hw/i386/remote-msg.c b/hw/i386/remote-msg.c
index 756b710..2a4d7f1 100644
--- a/hw/i386/remote-msg.c
+++ b/hw/i386/remote-msg.c
@@ -26,6 +26,7 @@  static void process_config_read(QIOChannel *ioc, PCIDevice *dev,
                                 MPQemuMsg *msg);
 static void process_bar_write(QIOChannel *ioc, MPQemuMsg *msg, Error **errp);
 static void process_bar_read(QIOChannel *ioc, MPQemuMsg *msg, Error **errp);
+static void process_proxy_ping_msg(QIOChannel *ioc, Error **errp);
 
 gboolean mpqemu_process_msg(QIOChannel *ioc, GIOCondition cond,
                             gpointer opaque)
@@ -75,6 +76,9 @@  gboolean mpqemu_process_msg(QIOChannel *ioc, GIOCondition cond,
     case SET_IRQFD:
         process_set_irqfd_msg(pci_dev, &msg);
         break;
+    case PROXY_PING:
+        process_proxy_ping_msg(ioc, &local_err);
+        break;
     default:
         error_setg(&local_err,
                    "Unknown command (%d) received for device %s (pid=%d)",
@@ -223,3 +227,18 @@  fail:
                    "in remote process pid=%d", getpid());
     }
 }
+
+static void process_proxy_ping_msg(QIOChannel *ioc, Error **errp)
+{
+    MPQemuMsg ret = { 0 };
+    Error *local_err = NULL;
+
+    ret.cmd = RET_MSG;
+    ret.size = sizeof(ret.data1);
+
+    mpqemu_msg_send(&ret, ioc, &local_err);
+    if (local_err) {
+        error_setg(errp, "Error while sending message to proxy "
+                   "in remote process pid=%d", getpid());
+    }
+}
diff --git a/hw/pci/proxy.c b/hw/pci/proxy.c
index 50a806c..490093c 100644
--- a/hw/pci/proxy.c
+++ b/hw/pci/proxy.c
@@ -24,6 +24,8 @@ 
 #include "util/event_notifier-posix.c"
 
 static void probe_pci_info(PCIDevice *dev, Error **errp);
+static void start_hb_timer(PCIProxyDev *dev);
+static void stop_hb_timer(PCIProxyDev *dev);
 
 static void proxy_set_socket(PCIProxyDev *pdev, int fd, Error **errp)
 {
@@ -111,6 +113,8 @@  static void pci_proxy_dev_realize(PCIDevice *device, Error **errp)
     setup_irqfd(dev);
 
     probe_pci_info(PCI_DEVICE(dev), errp);
+
+    start_hb_timer(dev);
 }
 
 static void pci_proxy_dev_exit(PCIDevice *pdev)
@@ -123,6 +127,8 @@  static void pci_proxy_dev_exit(PCIDevice *pdev)
 
     event_notifier_cleanup(&dev->intr);
     event_notifier_cleanup(&dev->resample);
+
+    stop_hb_timer(dev);
 }
 
 static int config_op_send(PCIProxyDev *pdev, uint32_t addr, uint32_t *val,
@@ -343,3 +349,49 @@  static void probe_pci_info(PCIDevice *dev, Error **errp)
         }
     }
 }
+
+static void hb_msg(PCIProxyDev *dev)
+{
+    DeviceState *ds = DEVICE(dev);
+    Error *local_err = NULL;
+    MPQemuMsg msg = { 0 };
+
+    msg.cmd = PROXY_PING;
+    msg.bytestream = 0;
+    msg.size = 0;
+
+    (void)mpqemu_msg_send_and_await_reply(&msg, dev->ioc, &local_err);
+    if (local_err) {
+        error_report_err(local_err);
+        qio_channel_close(dev->ioc, &local_err);
+        error_setg(&error_fatal, "Lost contact with device %s", ds->id);
+    }
+}
+
+#define NOP_INTERVAL 1000
+
+static void remote_ping(void *opaque)
+{
+    PCIProxyDev *dev = opaque;
+
+    hb_msg(dev);
+
+    timer_mod(dev->hb_timer,
+              qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + NOP_INTERVAL);
+}
+
+static void start_hb_timer(PCIProxyDev *dev)
+{
+    dev->hb_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL,
+                                 remote_ping,
+                                 dev);
+
+    timer_mod(dev->hb_timer,
+              qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + NOP_INTERVAL);
+}
+
+static void stop_hb_timer(PCIProxyDev *dev)
+{
+    timer_del(dev->hb_timer);
+    timer_free(dev->hb_timer);
+}
diff --git a/include/hw/pci/proxy.h b/include/hw/pci/proxy.h
index 15cc381..d784328 100644
--- a/include/hw/pci/proxy.h
+++ b/include/hw/pci/proxy.h
@@ -40,6 +40,8 @@  struct PCIProxyDev {
     EventNotifier intr;
     EventNotifier resample;
 
+    QEMUTimer *hb_timer;
+
     ProxyMemoryRegion region[PCI_NUM_REGIONS];
 };
 
diff --git a/include/io/mpqemu-link.h b/include/io/mpqemu-link.h
index a3844e1..5f2913f 100644
--- a/include/io/mpqemu-link.h
+++ b/include/io/mpqemu-link.h
@@ -40,6 +40,7 @@  typedef enum {
     BAR_READ,
     SET_IRQFD,
     GET_PCI_INFO,
+    PROXY_PING,
     MAX = INT_MAX,
 } MPQemuCmd;
 
diff --git a/io/mpqemu-link.c b/io/mpqemu-link.c
index 6fa4665..0222e81 100644
--- a/io/mpqemu-link.c
+++ b/io/mpqemu-link.c
@@ -280,6 +280,14 @@  bool mpqemu_msg_valid(MPQemuMsg *msg)
             return false;
         }
         break;
+    case PROXY_PING:
+        if (msg->bytestream || msg->num_fds) {
+            return false;
+        }
+        if (msg->size) {
+            return false;
+        }
+        break;
     default:
         break;
     }