@@ -14,7 +14,7 @@ typedef u64 netdev_features_t;
enum {
NETIF_F_SG_BIT, /* Scatter/gather IO. */
NETIF_F_IP_CSUM_BIT, /* Can checksum TCP/UDP over IPv4. */
- __UNUSED_NETIF_F_1,
+ NETIF_F_HW_ULP_DDP_BIT, /* ULP direct data placement offload */
NETIF_F_HW_CSUM_BIT, /* Can checksum all the packets. */
NETIF_F_IPV6_CSUM_BIT, /* Can checksum TCP/UDP over IPV6 */
NETIF_F_HIGHDMA_BIT, /* Can DMA to high memory. */
@@ -168,6 +168,7 @@ enum {
#define NETIF_F_HW_HSR_TAG_RM __NETIF_F(HW_HSR_TAG_RM)
#define NETIF_F_HW_HSR_FWD __NETIF_F(HW_HSR_FWD)
#define NETIF_F_HW_HSR_DUP __NETIF_F(HW_HSR_DUP)
+#define NETIF_F_HW_ULP_DDP __NETIF_F(HW_ULP_DDP)
/* Finds the next feature with the highest number of the range of start till 0.
*/
@@ -1005,6 +1005,7 @@ struct dev_ifalias {
struct devlink;
struct tlsdev_ops;
+struct ulp_ddp_dev_ops;
struct netdev_name_node {
struct hlist_node hlist;
@@ -2024,6 +2025,10 @@ struct net_device {
const struct tlsdev_ops *tlsdev_ops;
#endif
+#if IS_ENABLED(CONFIG_ULP_DDP)
+ const struct ulp_ddp_dev_ops *ulp_ddp_ops;
+#endif
+
const struct header_ops *header_ops;
unsigned char operstate;
@@ -689,6 +689,7 @@ typedef unsigned char *sk_buff_data_t;
* CHECKSUM_UNNECESSARY (max 3)
* @dst_pending_confirm: need to confirm neighbour
* @decrypted: Decrypted SKB
+ * @ddp_crc: DDP or CRC offloaded
* @napi_id: id of the NAPI struct this skb came from
* @sender_cpu: (aka @napi_id) source CPU in XPS
* @secmark: security marking
@@ -870,6 +871,9 @@ struct sk_buff {
#ifdef CONFIG_TLS_DEVICE
__u8 decrypted:1;
#endif
+#ifdef CONFIG_ULP_DDP
+ __u8 ddp_crc:1;
+#endif
#ifdef CONFIG_NET_SCHED
__u16 tc_index; /* traffic control index */
@@ -66,6 +66,8 @@ struct inet_connection_sock_af_ops {
* @icsk_ulp_ops Pluggable ULP control hook
* @icsk_ulp_data ULP private data
* @icsk_clean_acked Clean acked data hook
+ * @icsk_ulp_ddp_ops Pluggable ULP direct data placement control hook
+ * @icsk_ulp_ddp_data ULP direct data placement private data
* @icsk_listen_portaddr_node hash to the portaddr listener hashtable
* @icsk_ca_state: Congestion control state
* @icsk_retransmits: Number of unrecovered [RTO] timeouts
@@ -96,6 +98,8 @@ struct inet_connection_sock {
const struct tcp_ulp_ops *icsk_ulp_ops;
void __rcu *icsk_ulp_data;
void (*icsk_clean_acked)(struct sock *sk, u32 acked_seq);
+ const struct ulp_ddp_ulp_ops *icsk_ulp_ddp_ops;
+ void __rcu *icsk_ulp_ddp_data;
struct hlist_node icsk_listen_portaddr_node;
unsigned int (*icsk_sync_mss)(struct sock *sk, u32 pmtu);
__u8 icsk_ca_state:5,
new file mode 100644
@@ -0,0 +1,136 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * ulp_ddp.h
+ * Author: Boris Pismenny <borisp@mellanox.com>
+ * Copyright (C) 2021 Mellanox Technologies.
+ */
+#ifndef _ULP_DDP_H
+#define _ULP_DDP_H
+
+#include <linux/netdevice.h>
+#include <net/inet_connection_sock.h>
+#include <net/sock.h>
+
+/* limits returned by the offload driver, zero means don't care */
+struct ulp_ddp_limits {
+ int max_ddp_sgl_len;
+};
+
+enum ulp_ddp_type {
+ ULP_DDP_NVME = 1,
+};
+
+/**
+ * struct ulp_ddp_config - Generic ulp ddp configuration: tcp ddp IO queue
+ * config implementations must use this as the first member.
+ * Add new instances of ulp_ddp_config below (nvme-tcp, etc.).
+ */
+struct ulp_ddp_config {
+ enum ulp_ddp_type type;
+ unsigned char buf[];
+};
+
+/**
+ * struct nvme_tcp_ddp_config - nvme tcp ddp configuration for an IO queue
+ * @cfg: generic ulp ddp configuration header; kept as the first member
+ * @pfv: pdu version (e.g., NVME_TCP_PFV_1_0)
+ * @cpda: controller pdu data alignment (dwords, 0's based)
+ * @dgst: digest types enabled.
+ * The netdev will offload crc if ddp_crc is supported.
+ * @queue_size: number of nvme-tcp IO queue elements
+ * @queue_id: queue identifier
+ * @io_cpu: cpu core running the IO thread for this queue
+ */
+struct nvme_tcp_ddp_config {
+ struct ulp_ddp_config cfg;
+
+ u16 pfv;
+ u8 cpda;
+ u8 dgst;
+ int queue_size;
+ int queue_id;
+ int io_cpu;
+};
+
+/**
+ * struct ulp_ddp_io - ulp ddp configuration for an IO request.
+ *
+ * @command_id: identifier on the wire associated with these buffers
+ * @nents: number of entries in the sg_table
+ * @sg_table: describing the buffers for this IO request
+ * @first_sgl: first SGL in sg_table
+ */
+struct ulp_ddp_io {
+ u32 command_id;
+ int nents;
+ struct sg_table sg_table;
+ struct scatterlist first_sgl[SG_CHUNK_SIZE];
+};
+
+/**
+ * struct ulp_ddp_dev_ops - operations used by an upper layer protocol to configure ddp offload
+ * @ulp_ddp_limits: limit the number of scatter gather entries per IO.
+ * the device driver can use this to limit the resources allocated per queue.
+ * @ulp_ddp_sk_add: add offload for the queue represented by the socket+config pair.
+ * this function is used to configure either copy, crc or both offloads.
+ * @ulp_ddp_sk_del: remove offload from the socket, and release any device related resources.
+ * @ulp_ddp_setup: request copy offload for buffers associated with a command_id in ulp_ddp_io.
+ * @ulp_ddp_teardown: release offload resources association between buffers and command_id in
+ * ulp_ddp_io.
+ * @ulp_ddp_resync: respond to the driver's resync_request. Called only if resync is successful.
+ */
+struct ulp_ddp_dev_ops {
+ int (*ulp_ddp_limits)(struct net_device *netdev,
+ struct ulp_ddp_limits *limits);
+ int (*ulp_ddp_sk_add)(struct net_device *netdev,
+ struct sock *sk,
+ struct ulp_ddp_config *config);
+ void (*ulp_ddp_sk_del)(struct net_device *netdev,
+ struct sock *sk);
+ int (*ulp_ddp_setup)(struct net_device *netdev,
+ struct sock *sk,
+ struct ulp_ddp_io *io);
+ int (*ulp_ddp_teardown)(struct net_device *netdev,
+ struct sock *sk,
+ struct ulp_ddp_io *io,
+ void *ddp_ctx);
+ void (*ulp_ddp_resync)(struct net_device *netdev,
+ struct sock *sk, u32 seq);
+};
+
+#define ULP_DDP_RESYNC_REQ BIT(0)
+
+/**
+ * struct ulp_ddp_ulp_ops - Interface to register upper layer Direct Data Placement (DDP) TCP offload
+ */
+struct ulp_ddp_ulp_ops {
+ /** @resync_request: NIC requests ulp to indicate if @seq is the start of a message */
+ bool (*resync_request)(struct sock *sk, u32 seq, u32 flags);
+ /** @ddp_teardown_done: NIC driver informs the ulp that ddp teardown is done (async completions) */
+ void (*ddp_teardown_done)(void *ddp_ctx);
+};
+
+/**
+ * struct ulp_ddp_ctx - Generic ulp ddp context: device driver per queue contexts must
+ * use this as the first member.
+ */
+struct ulp_ddp_ctx {
+ enum ulp_ddp_type type;
+ unsigned char buf[];
+};
+
+static inline struct ulp_ddp_ctx *ulp_ddp_get_ctx(const struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ return (__force struct ulp_ddp_ctx *)icsk->icsk_ulp_ddp_data;
+}
+
+static inline void ulp_ddp_set_ctx(struct sock *sk, void *ctx)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ rcu_assign_pointer(icsk->icsk_ulp_ddp_data, ctx);
+}
+
+#endif /* _ULP_DDP_H */
@@ -454,4 +454,14 @@ config ETHTOOL_NETLINK
netlink. It provides better extensibility and some new features,
e.g. notification messages.
+config ULP_DDP
+ bool "ULP direct data placement offload"
+ default n
+ help
+ Direct Data Placement (DDP) offload enables a ULP, such as
+ NVMe-TCP/iSCSI, to request the NIC to place ULP payload data
+ of a command response directly into kernel pages while
+ calculating/verifying the data digest of ULP PDUs as they go
+ through the NIC, thus avoiding the costly per-byte overhead.
+
endif # if NET
@@ -71,6 +71,7 @@
#include <net/mpls.h>
#include <net/mptcp.h>
#include <net/page_pool.h>
+#include <net/ulp_ddp.h>
#include <linux/uaccess.h>
#include <trace/events/skb.h>
@@ -6295,9 +6296,14 @@ EXPORT_SYMBOL(pskb_extract);
*/
void skb_condense(struct sk_buff *skb)
{
+ bool is_ddp = false;
+
+#ifdef CONFIG_ULP_DDP
+ is_ddp = skb->ddp_crc;
+#endif
if (skb->data_len) {
if (skb->data_len > skb->end - skb->tail ||
- skb_cloned(skb))
+ skb_cloned(skb) || is_ddp)
return;
/* Nice, we can free page frag(s) right now */
@@ -73,6 +73,7 @@ const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] = {
[NETIF_F_HW_HSR_TAG_RM_BIT] = "hsr-tag-rm-offload",
[NETIF_F_HW_HSR_FWD_BIT] = "hsr-fwd-offload",
[NETIF_F_HW_HSR_DUP_BIT] = "hsr-dup-offload",
+ [NETIF_F_HW_ULP_DDP_BIT] = "ulp-ddp-offload",
};
const char
@@ -5149,6 +5149,9 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root,
memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
#ifdef CONFIG_TLS_DEVICE
nskb->decrypted = skb->decrypted;
+#endif
+#ifdef CONFIG_ULP_DDP
+ nskb->ddp_crc = skb->ddp_crc;
#endif
TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
if (list)
@@ -5182,6 +5185,11 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root,
#ifdef CONFIG_TLS_DEVICE
if (skb->decrypted != nskb->decrypted)
goto end;
+#endif
+#ifdef CONFIG_ULP_DDP
+
+ if (skb->ddp_crc != nskb->ddp_crc)
+ goto end;
#endif
}
}
@@ -1830,6 +1830,9 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
#ifdef CONFIG_TLS_DEVICE
tail->decrypted != skb->decrypted ||
+#endif
+#ifdef CONFIG_ULP_DDP
+ tail->ddp_crc != skb->ddp_crc ||
#endif
thtail->doff != th->doff ||
memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
@@ -262,6 +262,9 @@ struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb)
#ifdef CONFIG_TLS_DEVICE
flush |= p->decrypted ^ skb->decrypted;
#endif
+#ifdef CONFIG_ULP_DDP
+ flush |= p->ddp_crc ^ skb->ddp_crc;
+#endif
if (flush || skb_gro_receive(p, skb)) {
mss = 1;