diff mbox

[API-NEXT,RFC,09/31] linux-generic: PCI using pci-vfio

Message ID 1452285014-60320-10-git-send-email-christophe.milard@linaro.org
State New
Headers show

Commit Message

Christophe Milard Jan. 8, 2016, 8:29 p.m. UTC
Implementation of the PCI api, using pci-vfio.

Signed-off-by: Christophe Milard <christophe.milard@linaro.org>
---
 platform/linux-generic/Makefile.am                 |   2 +
 .../linux-generic/include/odp_pci_vfio_internal.h  |  92 +++
 platform/linux-generic/odp_pci_vfio.c              | 873 +++++++++++++++++++++
 3 files changed, 967 insertions(+)
 create mode 100644 platform/linux-generic/include/odp_pci_vfio_internal.h
 create mode 100644 platform/linux-generic/odp_pci_vfio.c
diff mbox

Patch

diff --git a/platform/linux-generic/Makefile.am b/platform/linux-generic/Makefile.am
index 1e1015d..6a08811 100644
--- a/platform/linux-generic/Makefile.am
+++ b/platform/linux-generic/Makefile.am
@@ -108,6 +108,7 @@  noinst_HEADERS = \
 		  ${srcdir}/include/odp_packet_socket.h \
 		  ${srcdir}/include/odp_packet_tap.h \
 		  ${srcdir}/include/odp_pci_internal.h \
+		  ${srcdir}/include/odp_pci_vfio_internal.h \
 		  ${srcdir}/include/odp_pkt_queue_internal.h \
 		  ${srcdir}/include/odp_pool_internal.h \
 		  ${srcdir}/include/odp_queue_internal.h \
@@ -138,6 +139,7 @@  __LIB__libodp_la_SOURCES = \
 			   odp_packet.c \
 			   odp_packet_flags.c \
 			   odp_packet_io.c \
+			   odp_pci_vfio.c \
 			   pktio/io_ops.c \
 			   pktio/pktio_common.c \
 			   pktio/loop.c \
diff --git a/platform/linux-generic/include/odp_pci_vfio_internal.h b/platform/linux-generic/include/odp_pci_vfio_internal.h
new file mode 100644
index 0000000..a7c2fe1
--- /dev/null
+++ b/platform/linux-generic/include/odp_pci_vfio_internal.h
@@ -0,0 +1,92 @@ 
+/* Copyright (c) 2015, Linaro Limited
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier:     BSD-3-Clause
+ */
+
+/**
+ * @file
+ *
+ * ODP pci vfio - implementation internal
+ */
+
+#ifndef ODP_PCI_VFIO_INTERNAL_H_
+#define ODP_PCI_VFIO_INTERNAL_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <odp/dma.h>
+#include <odp_dma_internal.h>
+#include <odp/pci.h>
+#include <odp_pci_internal.h>
+
+/**
+ * Initialise and map the PCI interface for the given pci address,
+ *
+ * @param pci_addr	pci address to the board that should be initialized and
+ *			mapped, i.e. something like: "0000:23:00.0"
+ *
+ * @returns 0 a pci device on success, NULL on error.
+ */
+int _odp_pci_vfio_init(const char *pci_addr, pci_dev_t *dev);
+
+/**
+ * Release a PCI device
+ *
+ * @param pci_dev	The PCI device to be realeased as returned by
+ *			_odp_pci_vfio_init()
+ */
+void _odp_pci_vfio_release(pci_dev_t *dev);
+
+/**
+ * map a DMA region fragment.
+ *
+ * @param dev		the pci device as returned by odp_pci_init
+ * @param region	the DMA region descriptor.
+ *
+ * @returns 0 on success or a negative value on error
+ */
+int _odp_pci_vfio_map_dma_region_f(pci_dev_t *dev, dma_map_t *region);
+
+/**
+ * unmap a DMA region fragment.
+ *
+ * @param dev		the pci device as returned by odp_pci_init
+ * @param region	the DMA region descriptor.
+ *
+ * @returns 0 on success or a negative value on error
+ */
+int _odp_pci_vfio_unmap_dma_region_f(pci_dev_t *dev, dma_map_t *region);
+
+/**
+ * Read the PCI config area
+ *
+ * @param dev		the pci device as returned by odp_pci_init
+ * @param buf		where to write the read data
+ * @param len		the number of bytes to read
+ * @param offs		the offset where read should start
+ *
+ * @returns the number of read bytes
+ */
+int _odp_pci_vfio_read_config(pci_dev_t *dev, void *buf, size_t len, int offs);
+
+/**
+ * Write the PCI config area
+ *
+ * @param dev		the pci device as returned by odp_pci_init
+ * @param buf		where to take the data to be written
+ * @param len		the number of bytes to written
+ * @param offs		the offset where data should be written
+ *
+ * @returns the number of written bytes
+ */
+int _odp_pci_vfio_write_config(pci_dev_t *dev, const void *buf,
+			       size_t len, int offs);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ODP_PCI_VFIO_INTERNAL_H_ */
diff --git a/platform/linux-generic/odp_pci_vfio.c b/platform/linux-generic/odp_pci_vfio.c
new file mode 100644
index 0000000..adddbd8
--- /dev/null
+++ b/platform/linux-generic/odp_pci_vfio.c
@@ -0,0 +1,873 @@ 
+/* Copyright (c) 2015, Linaro Limited
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier:     BSD-3-Clause
+ */
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include <linux/vfio.h>
+#include <linux/limits.h>
+#include <string.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <sys/ioctl.h>
+#include <linux/pci_regs.h>
+#include <unistd.h>
+#include <odp/pci.h>
+#include <odp/shared_memory.h>
+#include <odp_debug_internal.h>
+#include <odp_pci_vfio_internal.h>
+#include <odp_align_internal.h>
+#include <odp/spinlock.h>
+
+#define VFIO_CONTAINER_PATH "/dev/vfio/vfio"
+#define SYSFS_PCI_DEVICES "/sys/bus/pci/devices"
+#define VFIO_GROUP_FMT "/dev/vfio/%u"
+/* The vfio-pci driver harcodes map region n at address n<<40: */
+/* see VFIO_PCI_OFFSET_SHIFT in linux */
+#define VFIO_GET_REGION_ADDR(x) ((uint64_t)x << 40ULL)
+
+#define	DIM(a)	(sizeof(a) / sizeof((a)[0]))
+
+#define MAX_PCI_VFIO_GRP	 5
+#define MAX_PCI_VFIO_DEVICES	 8
+#define MAX_PCI_VFIO_DMA_FRAGS   50
+
+/*
+ * pci_vfio uses file descriptors to access differents objects.
+ * Access to data structures from different linux threads are protected
+ * by the following spinlock
+ */
+odp_spinlock_t vfio_thread_spinlock;
+
+/**
+ * device implementation data for the VFIO implementation of the PCI interface
+ */
+typedef struct vfio_t {
+	int iommu_group_no;
+	int container_fd;
+	int group_fd;
+	int dev_fd;
+	int config_fd;
+} vfio_t;
+
+/**
+ * data common to all pci vfio instances of this linux thread
+ */
+typedef struct pci_vfio_data_t {
+	int container_fd;	/* odp uses one container */
+	int container_ref_count;
+	int iommu_type;		/* iommu type set for this container */
+	dma_map_t *dma_maps[MAX_PCI_VFIO_DMA_FRAGS]; /* mapped regions */
+	const char *dev_names[MAX_PCI_VFIO_DEVICES]; /*initialised devices */
+	struct {
+		int group_no;
+		int group_fd;
+		int refcount;
+	} groups[MAX_PCI_VFIO_GRP];
+} pci_vfio_data_t;
+
+static pci_vfio_data_t pci_vfio_data;
+
+/* set PCI bus mastering */
+static int pci_vfio_set_bus_master(int dev_fd)
+{
+	uint16_t reg;
+	int ret;
+
+	ret = pread64(dev_fd, &reg, sizeof(reg),
+		      VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
+		      PCI_COMMAND);
+	if (ret != sizeof(reg)) {
+		ODP_ERR("Cannot read command from PCI config space!\n");
+		return -1;
+	}
+
+	/* set the master bit */
+	reg |= PCI_COMMAND_MASTER;
+
+	ret = pwrite64(dev_fd, &reg, sizeof(reg),
+		       VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
+		       PCI_COMMAND);
+
+	if (ret != sizeof(reg)) {
+		ODP_ERR("Cannot write command to PCI config space!\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * read pci device configuration space
+ */
+int _odp_pci_vfio_read_config(pci_dev_t *dev, void *buf, size_t len, int offs)
+{
+	vfio_t *vfio = (vfio_t *)dev->implementation_data;
+
+	return pread64(vfio->dev_fd, buf, len,
+		       VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX)
+		       + offs);
+}
+
+/*
+ * write pci device configuration space
+ */
+int _odp_pci_vfio_write_config(pci_dev_t *dev,
+			       const void *buf, size_t len, int offs)
+{
+	vfio_t *vfio = (vfio_t *)dev->implementation_data;
+
+	return pwrite64(vfio->dev_fd, buf, len,
+			VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX)
+			+ offs);
+}
+
+/*
+ * split string into tokens
+ */
+static int strsplit(char *string, int stringlen,
+		    char **tokens, int maxtokens, char delim)
+{
+	int i, tok = 0;
+	int tokstart = 1; /* first token is right at start of string */
+
+	if (string == NULL || tokens == NULL)
+		goto einval_error;
+
+	for (i = 0; i < stringlen; i++) {
+		if (string[i] == '\0' || tok >= maxtokens)
+			break;
+		if (tokstart) {
+			tokstart = 0;
+			tokens[tok++] = &string[i];
+		}
+		if (string[i] == delim) {
+			string[i] = '\0';
+			tokstart = 1;
+		}
+	}
+	return tok;
+
+einval_error:
+	return -1;
+}
+
+/**
+ * initialize linux thread common data
+ */
+static void pci_vfio_data_init(void)
+{
+	int i;
+
+	odp_spinlock_lock(&vfio_thread_spinlock);
+
+	pci_vfio_data.container_fd = -1;
+	pci_vfio_data.container_ref_count = 0;
+	pci_vfio_data.iommu_type = -1;
+	for (i = 0; i < MAX_PCI_VFIO_GRP; i++) {
+		pci_vfio_data.groups[i].group_no = -1;
+		pci_vfio_data.groups[i].group_fd = -1;
+		pci_vfio_data.groups[i].refcount = 0;
+	}
+	/* set all DMA fragments as free for this container: */
+	for (i = 0; i < MAX_PCI_VFIO_DMA_FRAGS; i++)
+		pci_vfio_data.dma_maps[i] = NULL;
+
+	odp_spinlock_unlock(&vfio_thread_spinlock);
+}
+
+/*
+ * open a vfio container.
+ * return: the container file descriptor (fd) or -1 on error
+ */
+static int pci_vfio_get_container_fd(void)
+{
+	int ret, vfio_container_fd;
+
+	/* initialise common vfio area  */
+	odp_spinlock_lock(&vfio_thread_spinlock);
+
+	/* if the file descriptor for the container already exists, return it */
+	if (pci_vfio_data.container_fd != -1) {
+		pci_vfio_data.container_ref_count++;
+		odp_spinlock_unlock(&vfio_thread_spinlock);
+		return pci_vfio_data.container_fd;
+	}
+
+	/* try to open the vfio container: */
+	vfio_container_fd = open(VFIO_CONTAINER_PATH, O_RDWR);
+	if (vfio_container_fd < 0) {
+		ODP_ERR("cannot open VFIO container, "
+			"error %i (%s)\n", errno, strerror(errno));
+		odp_spinlock_unlock(&vfio_thread_spinlock);
+		return -1;
+	}
+
+	/* check the VFIO API version */
+	ret = ioctl(vfio_container_fd, VFIO_GET_API_VERSION);
+	if (ret != VFIO_API_VERSION) {
+		if (ret < 0)
+			ODP_ERR("couldn't get VFIO API version, err=%i (%s)\n",
+				errno, strerror(errno));
+		else
+			ODP_ERR("unsupported VFIO API version!\n");
+		close(vfio_container_fd);
+		odp_spinlock_unlock(&vfio_thread_spinlock);
+		return -1;
+	}
+
+	/* check if we support IOMMU type 1 */
+	ret = ioctl(vfio_container_fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU);
+	if (ret != 1) {
+		if (ret < 0)
+			ODP_ERR("could not get IOMMU type, "
+				"error %i (%s)\n", errno,
+				strerror(errno));
+		else
+			ODP_ERR("unsupported IOMMU type "
+				"detected in VFIO\n");
+		close(vfio_container_fd);
+		odp_spinlock_unlock(&vfio_thread_spinlock);
+		return -1;
+	}
+
+	pci_vfio_data.container_fd = vfio_container_fd;
+	pci_vfio_data.container_ref_count = 1;
+	odp_spinlock_unlock(&vfio_thread_spinlock);
+	return vfio_container_fd;
+}
+
+static void pci_vfio_release_container_fd(int container_fd)
+{
+	odp_spinlock_lock(&vfio_thread_spinlock);
+
+	if (container_fd != pci_vfio_data.container_fd) {
+		ODP_ERR("Attempt to release an invalid container fd failed");
+		odp_spinlock_unlock(&vfio_thread_spinlock);
+		return;
+	}
+
+	if (--pci_vfio_data.container_ref_count > 0) {
+		odp_spinlock_unlock(&vfio_thread_spinlock);
+		return;
+	}
+
+	pci_vfio_data.container_ref_count = 0;
+	close(pci_vfio_data.container_fd);
+	pci_vfio_data.container_fd = -1;
+	odp_spinlock_unlock(&vfio_thread_spinlock);
+}
+
+/*
+ * get IOMMU group number for a PCI device
+ * returns -1 for errors, 0 for non-existent group
+ */
+static int pci_vfio_get_group_no(const char *pci_addr)
+{
+	char linkname[PATH_MAX];
+	char filename[PATH_MAX];
+	char *tok[16], *group_tok, *end;
+	int ret, iommu_group_no;
+
+	memset(linkname, 0, sizeof(linkname));
+	memset(filename, 0, sizeof(filename));
+
+	/* try to find out IOMMU group for this device */
+	snprintf(linkname, sizeof(linkname),
+		 SYSFS_PCI_DEVICES "/%s/iommu_group", pci_addr);
+
+	ret = readlink(linkname, filename, sizeof(filename));
+
+	/* if the link doesn't exist, no VFIO for us */
+	if (ret < 0) {
+		ODP_ERR("%s not managed by VFIO driver, skipping\n",
+			pci_addr);
+		ODP_ERR("linkname was %s, error: %s\n",
+			linkname, strerror(errno));
+		return -1;
+	}
+
+	ret = strsplit(filename, sizeof(filename), tok, DIM(tok), '/');
+
+	if (ret <= 0) {
+		ODP_ERR("%s cannot get IOMMU group\n", pci_addr);
+		return -1;
+	}
+
+	/* IOMMU group is always the last token */
+	errno = 0;
+	group_tok = tok[ret - 1];
+	end = group_tok;
+	iommu_group_no = strtol(group_tok, &end, 10);
+	if ((end != group_tok && *end != '\0') || errno != 0) {
+		ODP_ERR("%s error parsing IOMMU number!\n", pci_addr);
+		return -1;
+	}
+
+	return iommu_group_no;
+}
+
+/*
+ * open group fd
+ */
+static int pci_vfio_get_group_fd(int iommu_group_no)
+{
+	int vfio_group_fd;
+	char filename[PATH_MAX];
+	int index, free_index;
+
+	free_index = -1;
+
+	/* check if the group fd is already known:
+	 * Also search for a free slot in the table, in case we need one */
+	odp_spinlock_lock(&vfio_thread_spinlock);
+	for (index = 0; index < MAX_PCI_VFIO_GRP; index++) {
+		if ((pci_vfio_data.groups[index].group_no == iommu_group_no) &&
+		    (pci_vfio_data.groups[index].refcount > 0)) {
+			pci_vfio_data.groups[index].refcount++;
+			odp_spinlock_unlock(&vfio_thread_spinlock);
+			return pci_vfio_data.groups[index].group_fd;
+		}
+
+		if ((pci_vfio_data.groups[index].group_no == -1) &&
+		    (pci_vfio_data.groups[index].refcount == 0))
+			free_index = index;
+	}
+
+	if (free_index == -1) {
+		ODP_ERR("No space available in vfio grp table "
+			"(MAX_PCI_VFIO_GRP)");
+		odp_spinlock_unlock(&vfio_thread_spinlock);
+		return -1;
+	}
+	snprintf(filename, sizeof(filename),
+		 VFIO_GROUP_FMT, iommu_group_no);
+	vfio_group_fd = open(filename, O_RDWR);
+	if (vfio_group_fd < 0) {
+		/* if file not found, it's not an error */
+		if (errno != ENOENT) {
+			ODP_ERR("Cannot open %s: %s\n", filename,
+				strerror(errno));
+			odp_spinlock_unlock(&vfio_thread_spinlock);
+			return -1;
+		}
+		odp_spinlock_unlock(&vfio_thread_spinlock);
+		return 0;
+	}
+
+	pci_vfio_data.groups[free_index].group_no = iommu_group_no;
+	pci_vfio_data.groups[free_index].refcount = 1;
+	pci_vfio_data.groups[free_index].group_fd = vfio_group_fd;
+	odp_spinlock_unlock(&vfio_thread_spinlock);
+	return vfio_group_fd;
+}
+
+/*
+ * release group fd
+ */
+static void pci_vfio_release_group_fd(int vfio_group_fd)
+{
+	int index;
+
+	/* decrement the file descriptor refcount. close if last. */
+	odp_spinlock_lock(&vfio_thread_spinlock);
+	for (index = 0; index < MAX_PCI_VFIO_GRP; index++) {
+		if (pci_vfio_data.groups[index].group_fd == vfio_group_fd) {
+			if (--pci_vfio_data.groups[index].refcount == 0) {
+				pci_vfio_data.groups[index].group_no = -1;
+				close(vfio_group_fd);
+				odp_spinlock_unlock(&vfio_thread_spinlock);
+				return;
+			}
+		}
+	}
+	ODP_ERR("vfio: attempt to close a unreferenced group file descriptor");
+	odp_spinlock_unlock(&vfio_thread_spinlock);
+}
+
+/*
+ * check vfio group viability
+ * A group is said to be viable if all its members (i.e. devices)
+ * are either bound to VFIO or not bound at all
+ */
+static int pci_vfio_check_group_viable(const char *pci_addr, int vfio_group_fd)
+{
+	int ret;
+	struct vfio_group_status group_status = {
+			.argsz = sizeof(group_status)
+	};
+
+	/* check if the group is viable */
+	ret = ioctl(vfio_group_fd, VFIO_GROUP_GET_STATUS, &group_status);
+	if (ret) {
+		ODP_ERR("%s can't get group status, error %i (%s)\n",
+			pci_addr, errno, strerror(errno));
+		close(vfio_group_fd);
+		return -1;
+	} else if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
+		ODP_ERR("%s VFIO group is not viable!\n", pci_addr);
+		close(vfio_group_fd);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * add group to container, if group does not have a container, already
+ */
+static int pci_vfio_add_group_to_container(const char *pci_addr,
+					   int vfio_group_fd,
+					   int vfio_container_fd)
+{
+	int ret;
+	struct vfio_group_status group_status = {
+			.argsz = sizeof(group_status)
+	};
+
+	/* check if the group already has a container... */
+	ret = ioctl(vfio_group_fd, VFIO_GROUP_GET_STATUS, &group_status);
+	if (ret) {
+		ODP_ERR("%s cannot get group status, error %i (%s)\n",
+			pci_addr, errno, strerror(errno));
+		close(vfio_group_fd);
+		return -1;
+	}
+
+	/* if not, add the group to the container */
+	if (!(group_status.flags & VFIO_GROUP_FLAGS_CONTAINER_SET)) {
+		ODP_ERR("Adding group into container ...\n");
+		ret = ioctl(vfio_group_fd, VFIO_GROUP_SET_CONTAINER,
+			    &vfio_container_fd);
+		if (ret) {
+			ODP_ERR("%s can't add VFIO group to container "
+				"error %i (%s)\n",
+				pci_addr, errno, strerror(errno));
+			close(vfio_group_fd);
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ *  set iommu type to VFIO_TYPE1_IOMMU
+ */
+static int pcio_vfio_set_mmu_type1(int vfio_container_fd)
+{
+	int ret;
+
+	odp_spinlock_lock(&vfio_thread_spinlock);
+	if (pci_vfio_data.iommu_type > 0) {
+		odp_spinlock_unlock(&vfio_thread_spinlock);
+		return 0; /* already done */
+	}
+
+	ret = ioctl(vfio_container_fd, VFIO_SET_IOMMU,
+		    VFIO_TYPE1_IOMMU);
+	if (ret) {
+		ODP_ERR("cannot set IOMMU type, "
+				"error %i (%s)\n", errno, strerror(errno));
+		odp_spinlock_unlock(&vfio_thread_spinlock);
+		return -1;
+	}
+
+	pci_vfio_data.iommu_type = VFIO_TYPE1_IOMMU;
+	odp_spinlock_unlock(&vfio_thread_spinlock);
+	return 0;
+}
+
+/*
+ * returns true if the given DMA mapping fragment
+ * already exists for the container
+ */
+static bool dma_mapping_exists(dma_map_t *region_f)
+{
+	int i;
+	dma_map_t *map;
+
+	for (i = 0; i < MAX_PCI_VFIO_DMA_FRAGS; i++) {
+		map = pci_vfio_data.dma_maps[i];
+		if (map == NULL)
+			continue;
+		if ((region_f->dma_addr == map->dma_addr) &&
+		    (region_f->size == map->size))
+			return true;
+	}
+
+	return false;
+}
+
+/*
+ * set up DMA mappings.
+ */
+int _odp_pci_vfio_map_dma_region_f(pci_dev_t *dev, dma_map_t *fragment)
+{
+	int ret;
+	int i, free_index;
+	vfio_t *vfio = (vfio_t *)dev->implementation_data;
+	struct vfio_iommu_type1_dma_map dma_map;
+
+	odp_spinlock_lock(&vfio_thread_spinlock);
+
+	/* search for free room for this new fragment: */
+	free_index = -1;
+	for (i = 0; i < MAX_PCI_VFIO_DMA_FRAGS; i++) {
+		if (pci_vfio_data.dma_maps[i] == NULL) {
+			free_index = i;
+			break;
+		}
+	}
+
+	if (free_index == -1) {
+		ODP_DBG("DMA MAPPING: No room left for new DMA mapping");
+		odp_spinlock_unlock(&vfio_thread_spinlock);
+		return -1;
+	}
+
+	/*check that no equivalent fragment (mapping the same area) exist: */
+	if (dma_mapping_exists(fragment)) {
+		odp_spinlock_unlock(&vfio_thread_spinlock);
+		return 0;
+	}
+
+	/* perform the DMA mapping */
+	memset(&dma_map, 0, sizeof(dma_map));
+	dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
+	dma_map.size = ODP_PAGE_SIZE_ROUNDUP(fragment->size);
+	dma_map.iova = fragment->dma_addr;
+	dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
+	dma_map.vaddr = (uint64_t)fragment->addr;
+	ODP_DBG("DMA MAPPING: iova addr: %lx to addr: %lx of size:%ld\n",
+		(uint64_t)dma_map.iova,
+		(uint64_t)dma_map.vaddr,
+		(uint64_t)dma_map.size);
+	ret = ioctl(vfio->container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
+	if (ret) {
+		ODP_ERR("cannot set up DMA mapping, "
+				"error %i (%s)\n", errno, strerror(errno));
+		odp_spinlock_unlock(&vfio_thread_spinlock);
+		return -1;
+	}
+
+	/* remember this mapped fragment for this container */
+	pci_vfio_data.dma_maps[free_index] = fragment;
+	odp_spinlock_unlock(&vfio_thread_spinlock);
+	return 0;
+}
+
+/*
+ * remove DMA mappings.
+ */
+int _odp_pci_vfio_unmap_dma_region_f(pci_dev_t *dev, dma_map_t *fragment)
+{
+	int i;
+	int ret;
+	vfio_t *vfio = (vfio_t *)dev->implementation_data;
+	struct vfio_iommu_type1_dma_unmap dma_unmap;
+
+	odp_spinlock_lock(&vfio_thread_spinlock);
+	/* search this fragment in the list of known fragments */
+	for (i = 0; i < MAX_PCI_VFIO_DMA_FRAGS; i++)
+		if (pci_vfio_data.dma_maps[i] == fragment)
+			break;
+
+	/* fragment was never mapped: error! */
+	if ((i >= MAX_PCI_VFIO_DMA_FRAGS) ||
+	    (pci_vfio_data.dma_maps[i] != fragment)) {
+		odp_spinlock_unlock(&vfio_thread_spinlock);
+		return -1;
+	}
+
+	/* mark this fragment as deleteted */
+	pci_vfio_data.dma_maps[i] = NULL;
+
+	/*check that no equivalent fragment (mapping the same area) remains: */
+	if (dma_mapping_exists(fragment)) {
+		odp_spinlock_unlock(&vfio_thread_spinlock);
+		return 0;
+	}
+
+	/* perform the DMA unmapping */
+	memset(&dma_unmap, 0, sizeof(dma_unmap));
+	dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap);
+	dma_unmap.size = ODP_PAGE_SIZE_ROUNDUP(fragment->size);
+	dma_unmap.iova = fragment->dma_addr;
+	dma_unmap.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
+	ODP_DBG("DMA UNMAPPING: iova addr: %lx of size:%ld\n",
+		(uint64_t)dma_unmap.iova,
+		(uint64_t)dma_unmap.size);
+	ret = ioctl(vfio->container_fd, VFIO_IOMMU_UNMAP_DMA, &dma_unmap);
+	if (ret) {
+		ODP_ERR("cannot remove DMA mapping, "
+			"error %i (%s)\n", errno, strerror(errno));
+		odp_spinlock_unlock(&vfio_thread_spinlock);
+		return -1;
+	}
+
+	odp_spinlock_unlock(&vfio_thread_spinlock);
+	return 0;
+}
+
+/*
+ * get device file descriptor
+ */
+static int pci_vfio_get_device_fd(int vfio_group_fd, const char *pci_addr)
+{
+	int vfio_dev_fd;
+
+	/* get a file descriptor for the device */
+	vfio_dev_fd = ioctl(vfio_group_fd, VFIO_GROUP_GET_DEVICE_FD, pci_addr);
+	if (vfio_dev_fd < 0) {
+		/* if we cannot get a device fd, this simply means that this
+		 * particular port is not bound to VFIO
+		 */
+		ODP_DBG("%s not managed by VFIO driver, skipping\n"
+			"error %i (%s)\n", pci_addr, errno, strerror(errno));
+		return -1;
+	}
+
+	return vfio_dev_fd;
+}
+
+/*
+ * map a particular resource  from a device file
+ */
+static void *pci_map_resource(void *requested_addr, int fd,
+			      off_t offset, size_t size, int additional_flags)
+{
+	void *mapaddr;
+
+	/* Map the PCI memory resource of device */
+	mapaddr = mmap(requested_addr, size, PROT_READ | PROT_WRITE,
+		       MAP_SHARED | additional_flags, fd, offset);
+	if (mapaddr == MAP_FAILED) {
+		ODP_ERR("%s(): cannot mmap(%d, %p, 0x%lx, 0x%lx): %s (%p)\n",
+			__func__, fd, requested_addr,
+			(unsigned long)size, (unsigned long)offset,
+			strerror(errno), mapaddr);
+	} else
+		ODP_DBG("%s(): mapped(%d, %p, 0x%lx, 0x%lx): %s (%p)\n",
+			__func__, fd, requested_addr,
+			(unsigned long)size, (unsigned long)offset,
+			strerror(errno), mapaddr);
+
+	return mapaddr;
+}
+
+/*
+ * check if dev with pci_addr is already registered as initialised by this
+ * linux thread. If not, mark it as initialised (TAS)
+ * Return true if the pci address is not already registered, false otherwise.
+ */
+static bool tas_device(const char *pci_addr)
+{
+	int i;
+	int free_idx;
+
+	free_idx = -1;
+	odp_spinlock_lock(&vfio_thread_spinlock);
+	for (i = 0; i < MAX_PCI_VFIO_DEVICES; i++) {
+		if (pci_vfio_data.dev_names[i] == NULL) {
+			free_idx = i;
+			continue;
+		}
+
+		if (strncmp(pci_vfio_data.dev_names[i],
+			    pci_addr, _ODP_PCI_ADDR_LEN + 1) == 0) {
+			odp_spinlock_unlock(&vfio_thread_spinlock);
+			return false;
+		}
+	}
+
+	if (free_idx == -1) {
+		odp_spinlock_unlock(&vfio_thread_spinlock);
+		ODP_ERR("No space left for more vfio pci devices!");
+		return false; /* no space left -> prevent further init */
+	}
+
+	pci_vfio_data.dev_names[free_idx] = pci_addr;
+	odp_spinlock_unlock(&vfio_thread_spinlock);
+	return true;
+}
+
+/*
+ * remove the pci_address from the list of vfio registered devices
+ */
+static void rm_device(const char *pci_addr)
+{
+	int i;
+
+	odp_spinlock_lock(&vfio_thread_spinlock);
+	for (i = 0; i < MAX_PCI_VFIO_DEVICES; i++) {
+		if (pci_vfio_data.dev_names[i] == NULL)
+			continue;
+
+		if (strncmp(pci_vfio_data.dev_names[i],
+			    pci_addr, _ODP_PCI_ADDR_LEN + 1) == 0) {
+			pci_vfio_data.dev_names[i] = NULL;
+			odp_spinlock_unlock(&vfio_thread_spinlock);
+			return;
+		}
+	}
+
+	odp_spinlock_unlock(&vfio_thread_spinlock);
+}
+
+/*
+ * device init
+ */
+int _odp_pci_vfio_init(const char *pci_addr, pci_dev_t *dev)
+{
+	int ret;
+	int i;
+	struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
+	struct vfio_region_info reg = { .argsz = sizeof(reg) };
+	vfio_t *vfio;
+
+	/* initialize linux thread common data (only first time) */
+	pci_vfio_data_init();
+
+	/* prevent double init on same device */
+	if (!tas_device(pci_addr))
+		return -1;
+
+	/* allocate memory for vfio specific data and link it to dev: */
+	vfio = malloc(sizeof(vfio_t));
+	if (!vfio)
+		goto out;
+	dev->implementation_data = vfio;
+
+	strncpy(dev->pci_address, pci_addr, _ODP_PCI_ADDR_LEN);
+
+	/* open a vfio container: */
+	vfio->container_fd = pci_vfio_get_container_fd();
+	if (vfio->container_fd == -1)
+		goto out1;
+
+	/* get group number the target PCI_ADDRESS belongs to:*/
+	vfio->iommu_group_no = pci_vfio_get_group_no(pci_addr);
+	if ((vfio->iommu_group_no == -1) || (vfio->iommu_group_no == 0))
+		goto out1;
+
+	/* get the actual group fd */
+	vfio->group_fd = pci_vfio_get_group_fd(vfio->iommu_group_no);
+	if (vfio->group_fd < 0)
+		goto out1;
+
+	/* Test the group is viable */
+	if (pci_vfio_check_group_viable(pci_addr, vfio->group_fd) < 0)
+		goto out1;
+
+	/* Add the group to the container, if needed: */
+	if (pci_vfio_add_group_to_container(pci_addr, vfio->group_fd,
+					    vfio->container_fd) < 0)
+		goto out1;
+
+	/* set mmu type 1: */
+	if (pcio_vfio_set_mmu_type1(vfio->container_fd) < 0)
+		goto out1;
+
+	/* Get a file descriptor for the device */
+	vfio->dev_fd = pci_vfio_get_device_fd(vfio->group_fd, pci_addr);
+	if (vfio->dev_fd < 0)
+		goto out1;
+
+	/* Get pci device information (number of regions and interrupts...) */
+	ret = ioctl(vfio->dev_fd, VFIO_DEVICE_GET_INFO, &device_info);
+	if (ret) {
+		ODP_ERR("%s cannot get device info, "
+			"error %i (%s)\n", pci_addr, errno, strerror(errno));
+		close(vfio->dev_fd);
+		goto out1;
+	}
+	ODP_DBG("found %d regions in the device\n", device_info.num_regions);
+
+	/* map all the found BAR regions: */
+	for (i = VFIO_PCI_BAR0_REGION_INDEX;
+	     i <= VFIO_PCI_BAR5_REGION_INDEX; i++) {
+		reg.index = i;
+		ret = ioctl(vfio->dev_fd, VFIO_DEVICE_GET_REGION_INFO, &reg);
+		if (ret) {
+			ODP_ERR("%s cannot get device region info "
+				"error %i (%s)\n", pci_addr, errno,
+				strerror(errno));
+			continue;
+		}
+		ODP_DBG("REGION %d: size=%ld, offset=%lx:\n",
+			i, (long int)reg.size, (long int)reg.offset);
+
+		/* skip non-mmapable BARs */
+		if ((reg.flags & VFIO_REGION_INFO_FLAG_MMAP) == 0) {
+			ODP_DBG("found non mappable region %d in the device\n",
+				reg.index);
+			continue;
+		}
+
+		/* skip non-valid regions (size 0) */
+		if (!reg.size) {
+			ODP_DBG("found zero-sized region %d in the device\n",
+				reg.index);
+			continue;
+		}
+
+		dev->bar_maps[i].addr = pci_map_resource(NULL, vfio->dev_fd,
+							 reg.offset,
+							 reg.size, 0);
+		dev->bar_maps[i].size = reg.size;
+	}
+
+	pci_vfio_set_bus_master(vfio->dev_fd);
+
+	/* device reset and go... */
+	ioctl(vfio->dev_fd, VFIO_DEVICE_RESET);
+
+	return 0;
+
+	/*FIXME: proper error handling with release of resource (close()) */
+out1:
+	free(vfio);
+out:
+	rm_device(pci_addr);
+	return -1;
+}
+
+/*
+ * device release
+ */
+void _odp_pci_vfio_release(pci_dev_t *dev)
+{
+	vfio_t *vfio = (vfio_t *)dev->implementation_data;
+
+	/*FIXME: check if DMA mapping still exists for the device
+	  FIXME: or good enough to do that on final container closure...? */
+
+	/* release the config area file descriptor: */
+	close(vfio->config_fd);
+
+	/* release the device file descriptor: */
+	close(vfio->dev_fd);
+
+	/* release (if possible), the group file descriptor: */
+	pci_vfio_release_group_fd(vfio->group_fd);
+
+	/* release (if possible), the container file descriptor: */
+	pci_vfio_release_container_fd(vfio->container_fd);
+
+	/* release the vfio specific device data */
+	free(dev->implementation_data);
+
+	/* mark pci address as uninitialized: */
+	rm_device(dev->pci_address);
+}