@@ -108,6 +108,7 @@ noinst_HEADERS = \
${srcdir}/include/odp_packet_socket.h \
${srcdir}/include/odp_packet_tap.h \
${srcdir}/include/odp_pci_internal.h \
+ ${srcdir}/include/odp_pci_vfio_internal.h \
${srcdir}/include/odp_pkt_queue_internal.h \
${srcdir}/include/odp_pool_internal.h \
${srcdir}/include/odp_queue_internal.h \
@@ -138,6 +139,7 @@ __LIB__libodp_la_SOURCES = \
odp_packet.c \
odp_packet_flags.c \
odp_packet_io.c \
+ odp_pci_vfio.c \
pktio/io_ops.c \
pktio/pktio_common.c \
pktio/loop.c \
new file mode 100644
@@ -0,0 +1,92 @@
+/* Copyright (c) 2015, Linaro Limited
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+/**
+ * @file
+ *
+ * ODP pci vfio - implementation internal
+ */
+
+#ifndef ODP_PCI_VFIO_INTERNAL_H_
+#define ODP_PCI_VFIO_INTERNAL_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <odp/dma.h>
+#include <odp_dma_internal.h>
+#include <odp/pci.h>
+#include <odp_pci_internal.h>
+
+/**
+ * Initialise and map the PCI interface for the given pci address,
+ *
+ * @param pci_addr pci address to the board that should be initialized and
+ * mapped, i.e. something like: "0000:23:00.0"
+ *
+ * @returns 0 on success, a negative value on error.
+ */
+int _odp_pci_vfio_init(const char *pci_addr, pci_dev_t *dev);
+
+/**
+ * Release a PCI device
+ *
+ * @param dev The PCI device to be released, as returned by
+ * _odp_pci_vfio_init()
+ */
+void _odp_pci_vfio_release(pci_dev_t *dev);
+
+/**
+ * map a DMA region fragment.
+ *
+ * @param dev the pci device as returned by odp_pci_init
+ * @param region the DMA region descriptor.
+ *
+ * @returns 0 on success or a negative value on error
+ */
+int _odp_pci_vfio_map_dma_region_f(pci_dev_t *dev, dma_map_t *region);
+
+/**
+ * unmap a DMA region fragment.
+ *
+ * @param dev the pci device as returned by odp_pci_init
+ * @param region the DMA region descriptor.
+ *
+ * @returns 0 on success or a negative value on error
+ */
+int _odp_pci_vfio_unmap_dma_region_f(pci_dev_t *dev, dma_map_t *region);
+
+/**
+ * Read the PCI config area
+ *
+ * @param dev the pci device as returned by odp_pci_init
+ * @param buf where to write the read data
+ * @param len the number of bytes to read
+ * @param offs the offset where read should start
+ *
+ * @returns the number of read bytes
+ */
+int _odp_pci_vfio_read_config(pci_dev_t *dev, void *buf, size_t len, int offs);
+
+/**
+ * Write the PCI config area
+ *
+ * @param dev the pci device as returned by odp_pci_init
+ * @param buf where to take the data to be written
+ * @param len the number of bytes to be written
+ * @param offs the offset where data should be written
+ *
+ * @returns the number of written bytes
+ */
+int _odp_pci_vfio_write_config(pci_dev_t *dev, const void *buf,
+ size_t len, int offs);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ODP_PCI_VFIO_INTERNAL_H_ */
new file mode 100644
@@ -0,0 +1,873 @@
+/* Copyright (c) 2015, Linaro Limited
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include <linux/vfio.h>
+#include <linux/limits.h>
+#include <string.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <sys/ioctl.h>
+#include <linux/pci_regs.h>
+#include <unistd.h>
+#include <odp/pci.h>
+#include <odp/shared_memory.h>
+#include <odp_debug_internal.h>
+#include <odp_pci_vfio_internal.h>
+#include <odp_align_internal.h>
+#include <odp/spinlock.h>
+
+#define VFIO_CONTAINER_PATH "/dev/vfio/vfio"
+#define SYSFS_PCI_DEVICES "/sys/bus/pci/devices"
+#define VFIO_GROUP_FMT "/dev/vfio/%u"
+/* The vfio-pci driver hardcodes map region n at address n<<40: */
+/* see VFIO_PCI_OFFSET_SHIFT in linux */
+#define VFIO_GET_REGION_ADDR(x) ((uint64_t)x << 40ULL)
+
+#define DIM(a) (sizeof(a) / sizeof((a)[0]))
+
+#define MAX_PCI_VFIO_GRP 5
+#define MAX_PCI_VFIO_DEVICES 8
+#define MAX_PCI_VFIO_DMA_FRAGS 50
+
+/*
+ * pci_vfio uses file descriptors to access different objects.
+ * Access to data structures from different linux threads are protected
+ * by the following spinlock
+ */
+odp_spinlock_t vfio_thread_spinlock;
+
+/**
+ * device implementation data for the VFIO implementation of the PCI interface
+ * (one instance per initialised device, hung off pci_dev_t.implementation_data)
+ */
+typedef struct vfio_t {
+	int iommu_group_no;	/* IOMMU group the device belongs to */
+	int container_fd;	/* shared vfio container fd (refcounted) */
+	int group_fd;		/* fd of the device's IOMMU group */
+	int dev_fd;		/* device fd; regions are reached via offsets */
+	int config_fd;		/* NOTE(review): never assigned during init in
+				 * this file, yet closed in release -- verify */
+} vfio_t;
+
+/**
+ * data common to all pci vfio instances of this linux thread
+ * (all fields protected by vfio_thread_spinlock)
+ */
+typedef struct pci_vfio_data_t {
+	int container_fd; /* odp uses one container */
+	int container_ref_count; /* devices currently holding the container */
+	int iommu_type; /* iommu type set for this container */
+	dma_map_t *dma_maps[MAX_PCI_VFIO_DMA_FRAGS]; /* mapped regions */
+	const char *dev_names[MAX_PCI_VFIO_DEVICES]; /* initialised devices */
+	struct {
+		int group_no;	/* IOMMU group number, -1 when slot free */
+		int group_fd;	/* open fd for this group */
+		int refcount;	/* devices sharing this group fd */
+	} groups[MAX_PCI_VFIO_GRP];
+} pci_vfio_data_t;
+
+/* single per-process instance of the table above */
+static pci_vfio_data_t pci_vfio_data;
+
+/* set PCI bus mastering
+ * Reads the PCI command word from the vfio config region, sets the
+ * bus-master bit (needed before the device may initiate DMA) and writes
+ * the word back.
+ * Returns 0 on success, -1 on read/write failure.
+ */
+static int pci_vfio_set_bus_master(int dev_fd)
+{
+	uint16_t reg;
+	int ret;
+
+	/* read the current command word (pread64 takes the address of reg;
+	 * an earlier revision had this mangled into an "(R)" glyph) */
+	ret = pread64(dev_fd, &reg, sizeof(reg),
+		      VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
+		      PCI_COMMAND);
+	if (ret != sizeof(reg)) {
+		ODP_ERR("Cannot read command from PCI config space!\n");
+		return -1;
+	}
+
+	/* set the master bit */
+	reg |= PCI_COMMAND_MASTER;
+
+	ret = pwrite64(dev_fd, &reg, sizeof(reg),
+		       VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
+		       PCI_COMMAND);
+
+	if (ret != sizeof(reg)) {
+		ODP_ERR("Cannot write command to PCI config space!\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * read pci device configuration space
+ *
+ * vfio-pci exposes config space as a region of dev_fd; the file offset is
+ * derived from the region index (see VFIO_GET_REGION_ADDR).
+ *
+ * Returns the number of bytes read (pread64 semantics), or -1 on error.
+ */
+int _odp_pci_vfio_read_config(pci_dev_t *dev, void *buf, size_t len, int offs)
+{
+	vfio_t *vfio = (vfio_t *)dev->implementation_data;
+
+	return pread64(vfio->dev_fd, buf, len,
+		       VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX)
+		       + offs);
+}
+
+/*
+ * write pci device configuration space
+ *
+ * Mirror of _odp_pci_vfio_read_config(): writes through dev_fd at the
+ * config-region offset.
+ *
+ * Returns the number of bytes written (pwrite64 semantics), or -1 on error.
+ */
+int _odp_pci_vfio_write_config(pci_dev_t *dev,
+			       const void *buf, size_t len, int offs)
+{
+	vfio_t *vfio = (vfio_t *)dev->implementation_data;
+
+	return pwrite64(vfio->dev_fd, buf, len,
+			VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX)
+			+ offs);
+}
+
+/*
+ * split string into tokens
+ *
+ * Destructively tokenises 'string' in place: every 'delim' is replaced by
+ * a NUL and up to 'maxtokens' pointers to the token starts are stored in
+ * 'tokens'. Scanning stops at the first NUL or after 'stringlen' bytes.
+ *
+ * Returns the number of tokens found, or -1 on invalid arguments.
+ */
+static int strsplit(char *string, int stringlen,
+		    char **tokens, int maxtokens, char delim)
+{
+	int pos;
+	int count = 0;
+	int at_token_start = 1; /* first token begins at the very start */
+
+	if (string == NULL || tokens == NULL)
+		return -1;
+
+	for (pos = 0; pos < stringlen; pos++) {
+		if (string[pos] == '\0' || count >= maxtokens)
+			break;
+		if (at_token_start) {
+			tokens[count++] = &string[pos];
+			at_token_start = 0;
+		}
+		if (string[pos] == delim) {
+			string[pos] = '\0';
+			at_token_start = 1;
+		}
+	}
+
+	return count;
+}
+
+/**
+ * initialize linux thread common data
+ *
+ * Called from every _odp_pci_vfio_init(); only the first call actually
+ * resets the tables. Re-running the reset (as the previous version did)
+ * would drop the recorded container fd and group refcounts still held by
+ * already-initialised devices, leaking their file descriptors.
+ */
+static void pci_vfio_data_init(void)
+{
+	static int initialised;	/* protected by vfio_thread_spinlock */
+	int i;
+
+	odp_spinlock_lock(&vfio_thread_spinlock);
+
+	if (initialised) {
+		odp_spinlock_unlock(&vfio_thread_spinlock);
+		return;
+	}
+	initialised = 1;
+
+	pci_vfio_data.container_fd = -1;
+	pci_vfio_data.container_ref_count = 0;
+	pci_vfio_data.iommu_type = -1;
+	for (i = 0; i < MAX_PCI_VFIO_GRP; i++) {
+		pci_vfio_data.groups[i].group_no = -1;
+		pci_vfio_data.groups[i].group_fd = -1;
+		pci_vfio_data.groups[i].refcount = 0;
+	}
+	/* set all DMA fragments as free for this container: */
+	for (i = 0; i < MAX_PCI_VFIO_DMA_FRAGS; i++)
+		pci_vfio_data.dma_maps[i] = NULL;
+
+	odp_spinlock_unlock(&vfio_thread_spinlock);
+}
+
+/*
+ * open a vfio container.
+ * The process shares a single container: repeated calls return the cached
+ * fd and bump its refcount; the first call opens /dev/vfio/vfio and
+ * verifies the API version and type-1 IOMMU support.
+ * return: the container file descriptor (fd) or -1 on error
+ */
+static int pci_vfio_get_container_fd(void)
+{
+	int ret, vfio_container_fd;
+
+	/* initialise common vfio area */
+	odp_spinlock_lock(&vfio_thread_spinlock);
+
+	/* if the file descriptor for the container already exists, return it */
+	if (pci_vfio_data.container_fd != -1) {
+		pci_vfio_data.container_ref_count++;
+		odp_spinlock_unlock(&vfio_thread_spinlock);
+		return pci_vfio_data.container_fd;
+	}
+
+	/* try to open the vfio container: */
+	vfio_container_fd = open(VFIO_CONTAINER_PATH, O_RDWR);
+	if (vfio_container_fd < 0) {
+		ODP_ERR("cannot open VFIO container, "
+			"error %i (%s)\n", errno, strerror(errno));
+		odp_spinlock_unlock(&vfio_thread_spinlock);
+		return -1;
+	}
+
+	/* check the VFIO API version */
+	ret = ioctl(vfio_container_fd, VFIO_GET_API_VERSION);
+	if (ret != VFIO_API_VERSION) {
+		if (ret < 0)
+			ODP_ERR("couldn't get VFIO API version, err=%i (%s)\n",
+				errno, strerror(errno));
+		else
+			ODP_ERR("unsupported VFIO API version!\n");
+		close(vfio_container_fd);
+		odp_spinlock_unlock(&vfio_thread_spinlock);
+		return -1;
+	}
+
+	/* check if we support IOMMU type 1 */
+	ret = ioctl(vfio_container_fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU);
+	if (ret != 1) {
+		if (ret < 0)
+			ODP_ERR("could not get IOMMU type, "
+				"error %i (%s)\n", errno,
+				strerror(errno));
+		else
+			ODP_ERR("unsupported IOMMU type "
+				"detected in VFIO\n");
+		close(vfio_container_fd);
+		odp_spinlock_unlock(&vfio_thread_spinlock);
+		return -1;
+	}
+
+	/* cache the fd for subsequent callers (refcount starts at 1) */
+	pci_vfio_data.container_fd = vfio_container_fd;
+	pci_vfio_data.container_ref_count = 1;
+	odp_spinlock_unlock(&vfio_thread_spinlock);
+	return vfio_container_fd;
+}
+
+/*
+ * Drop one reference on the shared vfio container and close the fd when
+ * the last reference goes away. 'container_fd' must be the fd previously
+ * returned by pci_vfio_get_container_fd().
+ */
+static void pci_vfio_release_container_fd(int container_fd)
+{
+	odp_spinlock_lock(&vfio_thread_spinlock);
+
+	if (container_fd != pci_vfio_data.container_fd) {
+		ODP_ERR("Attempt to release an invalid container fd failed");
+		odp_spinlock_unlock(&vfio_thread_spinlock);
+		return;
+	}
+
+	/* still referenced by other devices: just decrement */
+	if (--pci_vfio_data.container_ref_count > 0) {
+		odp_spinlock_unlock(&vfio_thread_spinlock);
+		return;
+	}
+
+	/* last reference: close and reset the cached fd */
+	pci_vfio_data.container_ref_count = 0;
+	close(pci_vfio_data.container_fd);
+	pci_vfio_data.container_fd = -1;
+	odp_spinlock_unlock(&vfio_thread_spinlock);
+}
+
+/*
+ * get IOMMU group number for a PCI device
+ * Resolves the sysfs symlink
+ * /sys/bus/pci/devices/<addr>/iommu_group -> .../<group_no>
+ * and parses the trailing path component as the group number.
+ * returns -1 for errors, 0 for non-existent group
+ */
+static int pci_vfio_get_group_no(const char *pci_addr)
+{
+	char linkname[PATH_MAX];
+	char filename[PATH_MAX];
+	char *tok[16], *group_tok, *end;
+	int ret, iommu_group_no;
+
+	memset(linkname, 0, sizeof(linkname));
+	memset(filename, 0, sizeof(filename));
+
+	/* try to find out IOMMU group for this device */
+	snprintf(linkname, sizeof(linkname),
+		 SYSFS_PCI_DEVICES "/%s/iommu_group", pci_addr);
+
+	ret = readlink(linkname, filename, sizeof(filename));
+
+	/* if the link doesn't exist, no VFIO for us */
+	if (ret < 0) {
+		ODP_ERR("%s not managed by VFIO driver, skipping\n",
+			pci_addr);
+		ODP_ERR("linkname was %s, error: %s\n",
+			linkname, strerror(errno));
+		return -1;
+	}
+
+	/* split the link target on '/' -- the group number is the last part */
+	ret = strsplit(filename, sizeof(filename), tok, DIM(tok), '/');
+
+	if (ret <= 0) {
+		ODP_ERR("%s cannot get IOMMU group\n", pci_addr);
+		return -1;
+	}
+
+	/* IOMMU group is always the last token */
+	errno = 0;
+	group_tok = tok[ret - 1];
+	end = group_tok;
+	iommu_group_no = strtol(group_tok, &end, 10);
+	/* NOTE(review): when no digits are parsed (end == group_tok) this
+	 * check passes and strtol's 0 is returned; the caller treats 0 as
+	 * "non-existent group" -- confirm this is the intended behaviour */
+	if ((end != group_tok && *end != '\0') || errno != 0) {
+		ODP_ERR("%s error parsing IOMMU number!\n", pci_addr);
+		return -1;
+	}
+
+	return iommu_group_no;
+}
+
+/*
+ * open group fd
+ * Returns a (possibly cached, refcounted) fd for /dev/vfio/<group_no>.
+ * Returns -1 on error, 0 when the group node does not exist (device not
+ * bound to vfio).
+ */
+static int pci_vfio_get_group_fd(int iommu_group_no)
+{
+	int vfio_group_fd;
+	char filename[PATH_MAX];
+	int index, free_index;
+
+	free_index = -1;
+
+	/* check if the group fd is already known:
+	 * Also search for a free slot in the table, in case we need one */
+	odp_spinlock_lock(&vfio_thread_spinlock);
+	for (index = 0; index < MAX_PCI_VFIO_GRP; index++) {
+		if ((pci_vfio_data.groups[index].group_no == iommu_group_no) &&
+		    (pci_vfio_data.groups[index].refcount > 0)) {
+			pci_vfio_data.groups[index].refcount++;
+			odp_spinlock_unlock(&vfio_thread_spinlock);
+			return pci_vfio_data.groups[index].group_fd;
+		}
+
+		if ((pci_vfio_data.groups[index].group_no == -1) &&
+		    (pci_vfio_data.groups[index].refcount == 0))
+			free_index = index;
+	}
+
+	if (free_index == -1) {
+		ODP_ERR("No space available in vfio grp table "
+			"(MAX_PCI_VFIO_GRP)");
+		odp_spinlock_unlock(&vfio_thread_spinlock);
+		return -1;
+	}
+	snprintf(filename, sizeof(filename),
+		 VFIO_GROUP_FMT, iommu_group_no);
+	vfio_group_fd = open(filename, O_RDWR);
+	if (vfio_group_fd < 0) {
+		/* if file not found, it's not an error */
+		if (errno != ENOENT) {
+			ODP_ERR("Cannot open %s: %s\n", filename,
+				strerror(errno));
+			odp_spinlock_unlock(&vfio_thread_spinlock);
+			return -1;
+		}
+		/* NOTE(review): 0 is returned here to mean "not bound to
+		 * VFIO", but _odp_pci_vfio_init() only checks for < 0, so
+		 * fd 0 would be used as a group fd -- verify */
+		odp_spinlock_unlock(&vfio_thread_spinlock);
+		return 0;
+	}
+
+	/* record the freshly opened fd in the free slot (refcount 1) */
+	pci_vfio_data.groups[free_index].group_no = iommu_group_no;
+	pci_vfio_data.groups[free_index].refcount = 1;
+	pci_vfio_data.groups[free_index].group_fd = vfio_group_fd;
+	odp_spinlock_unlock(&vfio_thread_spinlock);
+	return vfio_group_fd;
+}
+
+/*
+ * release group fd
+ * Decrements the refcount of the given (previously opened) group fd and
+ * closes it when the last reference goes away. The previous version fell
+ * through to the error message when the refcount was merely decremented,
+ * logging a spurious error, and left the closed fd in the table where a
+ * later lookup could match it.
+ */
+static void pci_vfio_release_group_fd(int vfio_group_fd)
+{
+	int index;
+
+	/* decrement the file descriptor refcount. close if last. */
+	odp_spinlock_lock(&vfio_thread_spinlock);
+	for (index = 0; index < MAX_PCI_VFIO_GRP; index++) {
+		if (pci_vfio_data.groups[index].group_fd != vfio_group_fd ||
+		    pci_vfio_data.groups[index].refcount <= 0)
+			continue;
+
+		if (--pci_vfio_data.groups[index].refcount == 0) {
+			/* last user: close and invalidate the table slot */
+			pci_vfio_data.groups[index].group_no = -1;
+			pci_vfio_data.groups[index].group_fd = -1;
+			close(vfio_group_fd);
+		}
+		odp_spinlock_unlock(&vfio_thread_spinlock);
+		return;
+	}
+	ODP_ERR("vfio: attempt to close a unreferenced group file descriptor");
+	odp_spinlock_unlock(&vfio_thread_spinlock);
+}
+
+/*
+ * check vfio group viability
+ * A group is said to be viable if all its members (i.e. devices)
+ * are either bound to VFIO or not bound at all
+ *
+ * Note: on failure this function closes vfio_group_fd itself; the caller
+ * must not use (or re-close) the fd after a negative return.
+ * Returns 0 when the group is viable, -1 otherwise.
+ */
+static int pci_vfio_check_group_viable(const char *pci_addr, int vfio_group_fd)
+{
+	int ret;
+	struct vfio_group_status group_status = {
+		.argsz = sizeof(group_status)
+	};
+
+	/* check if the group is viable */
+	ret = ioctl(vfio_group_fd, VFIO_GROUP_GET_STATUS, &group_status);
+	if (ret) {
+		ODP_ERR("%s can't get group status, error %i (%s)\n",
+			pci_addr, errno, strerror(errno));
+		close(vfio_group_fd);
+		return -1;
+	} else if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
+		ODP_ERR("%s VFIO group is not viable!\n", pci_addr);
+		close(vfio_group_fd);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * add group to container, if group does not have a container, already
+ *
+ * Note: on failure this function closes vfio_group_fd itself; the caller
+ * must not use the fd after a negative return.
+ * Returns 0 on success, -1 on error.
+ */
+static int pci_vfio_add_group_to_container(const char *pci_addr,
+					   int vfio_group_fd,
+					   int vfio_container_fd)
+{
+	int ret;
+	struct vfio_group_status group_status = {
+		.argsz = sizeof(group_status)
+	};
+
+	/* check if the group already has a container... */
+	ret = ioctl(vfio_group_fd, VFIO_GROUP_GET_STATUS, &group_status);
+	if (ret) {
+		ODP_ERR("%s cannot get group status, error %i (%s)\n",
+			pci_addr, errno, strerror(errno));
+		close(vfio_group_fd);
+		return -1;
+	}
+
+	/* if not, add the group to the container */
+	if (!(group_status.flags & VFIO_GROUP_FLAGS_CONTAINER_SET)) {
+		/* informational message: was wrongly logged at ERR level */
+		ODP_DBG("Adding group into container ...\n");
+		ret = ioctl(vfio_group_fd, VFIO_GROUP_SET_CONTAINER,
+			    &vfio_container_fd);
+		if (ret) {
+			ODP_ERR("%s can't add VFIO group to container "
+				"error %i (%s)\n",
+				pci_addr, errno, strerror(errno));
+			close(vfio_group_fd);
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * set iommu type to VFIO_TYPE1_IOMMU
+ * The result is cached in pci_vfio_data.iommu_type so the ioctl is issued
+ * only once per process (VFIO_SET_IOMMU fails if repeated on a container).
+ * (Function name typo "pcio" kept: it is referenced elsewhere in the file.)
+ * Returns 0 on success (or if already set), -1 on error.
+ */
+static int pcio_vfio_set_mmu_type1(int vfio_container_fd)
+{
+	int ret;
+
+	odp_spinlock_lock(&vfio_thread_spinlock);
+	if (pci_vfio_data.iommu_type > 0) {
+		odp_spinlock_unlock(&vfio_thread_spinlock);
+		return 0; /* already done */
+	}
+
+	ret = ioctl(vfio_container_fd, VFIO_SET_IOMMU,
+		    VFIO_TYPE1_IOMMU);
+	if (ret) {
+		ODP_ERR("cannot set IOMMU type, "
+			"error %i (%s)\n", errno, strerror(errno));
+		odp_spinlock_unlock(&vfio_thread_spinlock);
+		return -1;
+	}
+
+	pci_vfio_data.iommu_type = VFIO_TYPE1_IOMMU;
+	odp_spinlock_unlock(&vfio_thread_spinlock);
+	return 0;
+}
+
+/*
+ * returns true if the given DMA mapping fragment
+ * already exists for the container
+ * (caller must hold vfio_thread_spinlock: scans the shared dma_maps table)
+ */
+static bool dma_mapping_exists(dma_map_t *region_f)
+{
+	int idx;
+
+	for (idx = 0; idx < MAX_PCI_VFIO_DMA_FRAGS; idx++) {
+		const dma_map_t *known = pci_vfio_data.dma_maps[idx];
+
+		if (known != NULL &&
+		    known->dma_addr == region_f->dma_addr &&
+		    known->size == region_f->size)
+			return true;
+	}
+
+	return false;
+}
+
+/*
+ * set up DMA mappings.
+ * Registers 'fragment' in the per-process table and, unless an identical
+ * (same iova, same size) mapping is already active, asks the IOMMU to map
+ * it for read+write. Returns 0 on success, -1 on error.
+ */
+int _odp_pci_vfio_map_dma_region_f(pci_dev_t *dev, dma_map_t *fragment)
+{
+	int ret;
+	int i, free_index;
+	vfio_t *vfio = (vfio_t *)dev->implementation_data;
+	struct vfio_iommu_type1_dma_map dma_map;
+
+	odp_spinlock_lock(&vfio_thread_spinlock);
+
+	/* search for free room for this new fragment: */
+	free_index = -1;
+	for (i = 0; i < MAX_PCI_VFIO_DMA_FRAGS; i++) {
+		if (pci_vfio_data.dma_maps[i] == NULL) {
+			free_index = i;
+			break;
+		}
+	}
+
+	if (free_index == -1) {
+		ODP_DBG("DMA MAPPING: No room left for new DMA mapping");
+		odp_spinlock_unlock(&vfio_thread_spinlock);
+		return -1;
+	}
+
+	/*check that no equivalent fragment (mapping the same area) exist: */
+	if (dma_mapping_exists(fragment)) {
+		odp_spinlock_unlock(&vfio_thread_spinlock);
+		return 0;
+	}
+
+	/* perform the DMA mapping */
+	memset(&dma_map, 0, sizeof(dma_map));
+	dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
+	/* size must be page-granular for the IOMMU */
+	dma_map.size = ODP_PAGE_SIZE_ROUNDUP(fragment->size);
+	dma_map.iova = fragment->dma_addr;
+	dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
+	/* NOTE(review): assumes pointers fit in 64 bits -- confirm on all
+	 * supported platforms */
+	dma_map.vaddr = (uint64_t)fragment->addr;
+	ODP_DBG("DMA MAPPING: iova addr: %lx to addr: %lx of size:%ld\n",
+		(uint64_t)dma_map.iova,
+		(uint64_t)dma_map.vaddr,
+		(uint64_t)dma_map.size);
+	ret = ioctl(vfio->container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
+	if (ret) {
+		ODP_ERR("cannot set up DMA mapping, "
+			"error %i (%s)\n", errno, strerror(errno));
+		odp_spinlock_unlock(&vfio_thread_spinlock);
+		return -1;
+	}
+
+	/* remember this mapped fragment for this container */
+	pci_vfio_data.dma_maps[free_index] = fragment;
+	odp_spinlock_unlock(&vfio_thread_spinlock);
+	return 0;
+}
+
+/*
+ * remove DMA mappings.
+ * Removes 'fragment' from the per-process table; the IOMMU unmap ioctl is
+ * only issued when no other registered fragment still covers the same
+ * (iova, size) area. Returns 0 on success, -1 on error (including when
+ * the fragment was never mapped).
+ */
+int _odp_pci_vfio_unmap_dma_region_f(pci_dev_t *dev, dma_map_t *fragment)
+{
+	int i;
+	int ret;
+	vfio_t *vfio = (vfio_t *)dev->implementation_data;
+	struct vfio_iommu_type1_dma_unmap dma_unmap;
+
+	odp_spinlock_lock(&vfio_thread_spinlock);
+	/* search this fragment in the list of known fragments */
+	for (i = 0; i < MAX_PCI_VFIO_DMA_FRAGS; i++)
+		if (pci_vfio_data.dma_maps[i] == fragment)
+			break;
+
+	/* fragment was never mapped: error! */
+	if ((i >= MAX_PCI_VFIO_DMA_FRAGS) ||
+	    (pci_vfio_data.dma_maps[i] != fragment)) {
+		odp_spinlock_unlock(&vfio_thread_spinlock);
+		return -1;
+	}
+
+	/* mark this fragment as deleted */
+	pci_vfio_data.dma_maps[i] = NULL;
+
+	/*check that no equivalent fragment (mapping the same area) remains: */
+	if (dma_mapping_exists(fragment)) {
+		odp_spinlock_unlock(&vfio_thread_spinlock);
+		return 0;
+	}
+
+	/* perform the DMA unmapping */
+	memset(&dma_unmap, 0, sizeof(dma_unmap));
+	dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap);
+	dma_unmap.size = ODP_PAGE_SIZE_ROUNDUP(fragment->size);
+	dma_unmap.iova = fragment->dma_addr;
+	dma_unmap.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
+	ODP_DBG("DMA UNMAPPING: iova addr: %lx of size:%ld\n",
+		(uint64_t)dma_unmap.iova,
+		(uint64_t)dma_unmap.size);
+	ret = ioctl(vfio->container_fd, VFIO_IOMMU_UNMAP_DMA, &dma_unmap);
+	if (ret) {
+		ODP_ERR("cannot remove DMA mapping, "
+			"error %i (%s)\n", errno, strerror(errno));
+		odp_spinlock_unlock(&vfio_thread_spinlock);
+		return -1;
+	}
+
+	odp_spinlock_unlock(&vfio_thread_spinlock);
+	return 0;
+}
+
+/*
+ * get device file descriptor
+ * Asks the IOMMU group for a device fd; failure simply means this device
+ * is not bound to the vfio-pci driver.
+ * Returns the fd on success, -1 otherwise.
+ */
+static int pci_vfio_get_device_fd(int vfio_group_fd, const char *pci_addr)
+{
+	int fd;
+
+	/* get a file descriptor for the device */
+	fd = ioctl(vfio_group_fd, VFIO_GROUP_GET_DEVICE_FD, pci_addr);
+	if (fd >= 0)
+		return fd;
+
+	/* if we cannot get a device fd, this simply means that this
+	 * particular port is not bound to VFIO
+	 */
+	ODP_DBG("%s not managed by VFIO driver, skipping\n"
+		"error %i (%s)\n", pci_addr, errno, strerror(errno));
+	return -1;
+}
+
+/*
+ * map a particular resource from a device file
+ * Thin wrapper around mmap() for a device BAR region.
+ * Returns the mapped address, or MAP_FAILED on error (caller must check).
+ */
+static void *pci_map_resource(void *requested_addr, int fd,
+			      off_t offset, size_t size, int additional_flags)
+{
+	void *mapaddr;
+
+	/* Map the PCI memory resource of device */
+	mapaddr = mmap(requested_addr, size, PROT_READ | PROT_WRITE,
+		       MAP_SHARED | additional_flags, fd, offset);
+	if (mapaddr == MAP_FAILED) {
+		ODP_ERR("%s(): cannot mmap(%d, %p, 0x%lx, 0x%lx): %s (%p)\n",
+			__func__, fd, requested_addr,
+			(unsigned long)size, (unsigned long)offset,
+			strerror(errno), mapaddr);
+	} else
+		/* NOTE(review): strerror(errno) on this success path prints
+		 * stale/unrelated errno text -- confirm it is intentional */
+		ODP_DBG("%s(): mapped(%d, %p, 0x%lx, 0x%lx): %s (%p)\n",
+			__func__, fd, requested_addr,
+			(unsigned long)size, (unsigned long)offset,
+			strerror(errno), mapaddr);
+
+	return mapaddr;
+}
+
+/*
+ * check if dev with pci_addr is already registered as initialised by this
+ * linux thread. If not, mark it as initialised (TAS)
+ * Return true if the pci address is not already registered, false otherwise.
+ *
+ * The address is duplicated (strdup) before being stored: the previous
+ * version kept the caller's pointer, which dangles if the caller's string
+ * does not outlive the device.
+ */
+static bool tas_device(const char *pci_addr)
+{
+	int i;
+	int free_idx;
+
+	free_idx = -1;
+	odp_spinlock_lock(&vfio_thread_spinlock);
+	for (i = 0; i < MAX_PCI_VFIO_DEVICES; i++) {
+		if (pci_vfio_data.dev_names[i] == NULL) {
+			free_idx = i;
+			continue;
+		}
+
+		if (strncmp(pci_vfio_data.dev_names[i],
+			    pci_addr, _ODP_PCI_ADDR_LEN + 1) == 0) {
+			odp_spinlock_unlock(&vfio_thread_spinlock);
+			return false;
+		}
+	}
+
+	if (free_idx == -1) {
+		odp_spinlock_unlock(&vfio_thread_spinlock);
+		ODP_ERR("No space left for more vfio pci devices!");
+		return false; /* no space left -> prevent further init */
+	}
+
+	/* store a private copy of the address (freed in rm_device()) */
+	pci_vfio_data.dev_names[free_idx] = strdup(pci_addr);
+	if (pci_vfio_data.dev_names[free_idx] == NULL) {
+		odp_spinlock_unlock(&vfio_thread_spinlock);
+		ODP_ERR("Cannot allocate vfio pci device name!");
+		return false;
+	}
+	odp_spinlock_unlock(&vfio_thread_spinlock);
+	return true;
+}
+
+/*
+ * remove the pci_address from the list of vfio registered devices
+ * (frees the copy taken by tas_device())
+ */
+static void rm_device(const char *pci_addr)
+{
+	int i;
+
+	odp_spinlock_lock(&vfio_thread_spinlock);
+	for (i = 0; i < MAX_PCI_VFIO_DEVICES; i++) {
+		if (pci_vfio_data.dev_names[i] == NULL)
+			continue;
+
+		if (strncmp(pci_vfio_data.dev_names[i],
+			    pci_addr, _ODP_PCI_ADDR_LEN + 1) == 0) {
+			free((char *)pci_vfio_data.dev_names[i]);
+			pci_vfio_data.dev_names[i] = NULL;
+			odp_spinlock_unlock(&vfio_thread_spinlock);
+			return;
+		}
+	}
+
+	odp_spinlock_unlock(&vfio_thread_spinlock);
+}
+
+/*
+ * device init
+ * Binds the device at 'pci_addr' to this process through vfio: opens the
+ * shared container, resolves and opens the device's IOMMU group, attaches
+ * the group to the container, sets the IOMMU type, opens the device and
+ * mmaps every mappable BAR region into dev->bar_maps[].
+ * Returns 0 on success, -1 on error.
+ */
+int _odp_pci_vfio_init(const char *pci_addr, pci_dev_t *dev)
+{
+	int ret;
+	int i;
+	struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
+	struct vfio_region_info reg = { .argsz = sizeof(reg) };
+	vfio_t *vfio;
+
+	/* initialize linux thread common data (only first time) */
+	pci_vfio_data_init();
+
+	/* prevent double init on same device */
+	if (!tas_device(pci_addr))
+		return -1;
+
+	/* allocate memory for vfio specific data and link it to dev: */
+	vfio = malloc(sizeof(vfio_t));
+	if (!vfio)
+		goto out;
+	dev->implementation_data = vfio;
+	/* no separate config fd is opened (config space is reached through
+	 * dev_fd + region offset); keep it invalid so that closing it in
+	 * _odp_pci_vfio_release() cannot hit an unrelated fd: */
+	vfio->config_fd = -1;
+
+	/* NOTE(review): assumes dev->pci_address holds _ODP_PCI_ADDR_LEN
+	 * chars plus NUL; strncpy does not guarantee termination -- verify */
+	strncpy(dev->pci_address, pci_addr, _ODP_PCI_ADDR_LEN);
+
+	/* open a vfio container: */
+	vfio->container_fd = pci_vfio_get_container_fd();
+	if (vfio->container_fd == -1)
+		goto out1;
+
+	/* get group number the target PCI_ADDRESS belongs to:*/
+	vfio->iommu_group_no = pci_vfio_get_group_no(pci_addr);
+	if ((vfio->iommu_group_no == -1) || (vfio->iommu_group_no == 0))
+		goto out1;
+
+	/* get the actual group fd */
+	vfio->group_fd = pci_vfio_get_group_fd(vfio->iommu_group_no);
+	if (vfio->group_fd < 0)
+		goto out1;
+
+	/* Test the group is viable */
+	if (pci_vfio_check_group_viable(pci_addr, vfio->group_fd) < 0)
+		goto out1;
+
+	/* Add the group to the container, if needed: */
+	if (pci_vfio_add_group_to_container(pci_addr, vfio->group_fd,
+					    vfio->container_fd) < 0)
+		goto out1;
+
+	/* set mmu type 1: */
+	if (pcio_vfio_set_mmu_type1(vfio->container_fd) < 0)
+		goto out1;
+
+	/* Get a file descriptor for the device */
+	vfio->dev_fd = pci_vfio_get_device_fd(vfio->group_fd, pci_addr);
+	if (vfio->dev_fd < 0)
+		goto out1;
+
+	/* Get pci device information (number of regions and interrupts...) */
+	ret = ioctl(vfio->dev_fd, VFIO_DEVICE_GET_INFO, &device_info);
+	if (ret) {
+		ODP_ERR("%s cannot get device info, "
+			"error %i (%s)\n", pci_addr, errno, strerror(errno));
+		close(vfio->dev_fd);
+		goto out1;
+	}
+	ODP_DBG("found %d regions in the device\n", device_info.num_regions);
+
+	/* map all the found BAR regions: */
+	for (i = VFIO_PCI_BAR0_REGION_INDEX;
+	     i <= VFIO_PCI_BAR5_REGION_INDEX; i++) {
+		reg.index = i;
+		/* the ioctl takes the address of reg (an earlier revision
+		 * had this argument mangled into an "(R)" glyph) */
+		ret = ioctl(vfio->dev_fd, VFIO_DEVICE_GET_REGION_INFO, &reg);
+		if (ret) {
+			ODP_ERR("%s cannot get device region info "
+				"error %i (%s)\n", pci_addr, errno,
+				strerror(errno));
+			continue;
+		}
+		ODP_DBG("REGION %d: size=%ld, offset=%lx:\n",
+			i, (long int)reg.size, (long int)reg.offset);
+
+		/* skip non-mmapable BARs */
+		if ((reg.flags & VFIO_REGION_INFO_FLAG_MMAP) == 0) {
+			ODP_DBG("found non mappable region %d in the device\n",
+				reg.index);
+			continue;
+		}
+
+		/* skip non-valid regions (size 0) */
+		if (!reg.size) {
+			ODP_DBG("found zero-sized region %d in the device\n",
+				reg.index);
+			continue;
+		}
+
+		dev->bar_maps[i].addr = pci_map_resource(NULL, vfio->dev_fd,
+							 reg.offset,
+							 reg.size, 0);
+		dev->bar_maps[i].size = reg.size;
+	}
+
+	pci_vfio_set_bus_master(vfio->dev_fd);
+
+	/* device reset and go... */
+	ioctl(vfio->dev_fd, VFIO_DEVICE_RESET);
+
+	return 0;
+
+	/*FIXME: proper error handling with release of resource (close()) */
+out1:
+	free(vfio);
+	dev->implementation_data = NULL; /* don't leave a dangling pointer */
+out:
+	rm_device(pci_addr);
+	return -1;
+}
+
+/*
+ * device release
+ * Tears down everything _odp_pci_vfio_init() set up: file descriptors,
+ * group/container references, the vfio private data, and the "initialised
+ * device" registration.
+ */
+void _odp_pci_vfio_release(pci_dev_t *dev)
+{
+	vfio_t *vfio = (vfio_t *)dev->implementation_data;
+
+	/*FIXME: check if DMA mapping still exists for the device
+	  FIXME: or good enough to do that on final container closure...? */
+
+	/* release the config area file descriptor: */
+	/* NOTE(review): config_fd does not appear to be assigned during
+	 * init in this file; closing it here may hit an unrelated fd --
+	 * verify */
+	close(vfio->config_fd);
+
+	/* release the device file descriptor: */
+	close(vfio->dev_fd);
+
+	/* release (if possible), the group file descriptor: */
+	pci_vfio_release_group_fd(vfio->group_fd);
+
+	/* release (if possible), the container file descriptor: */
+	pci_vfio_release_container_fd(vfio->container_fd);
+
+	/* release the vfio specific device data */
+	free(dev->implementation_data);
+
+	/* mark pci address as uninitialized: */
+	rm_device(dev->pci_address);
+}
Implementation of the PCI api, using pci-vfio. Signed-off-by: Christophe Milard <christophe.milard@linaro.org> --- platform/linux-generic/Makefile.am | 2 + .../linux-generic/include/odp_pci_vfio_internal.h | 92 +++ platform/linux-generic/odp_pci_vfio.c | 873 +++++++++++++++++++++ 3 files changed, 967 insertions(+) create mode 100644 platform/linux-generic/include/odp_pci_vfio_internal.h create mode 100644 platform/linux-generic/odp_pci_vfio.c