@@ -674,18 +674,65 @@ struct rps_map {
};
#define RPS_MAP_SIZE(_num) (sizeof(struct rps_map) + ((_num) * sizeof(u16)))
+/* The rps_cpu_qid structure is sixteen bits and holds either a CPU number or
+ * a queue index. The use_qid field specifies which type of value is set (i.e.
+ * if use_qid is 1 then cpu_qid contains a fifteen bit queue identifier, and if
+ * use_qid is 0 then cpu_qid contains a fifteen bit CPU number). No entry is
+ * signified by RPS_NO_CPU_QID in val which is set to NO_QUEUE (0xffff). So the
+ * range of CPU numbers that can be stored is 0..32,767 (0x7fff) and the range
+ * of queue identifiers is 0..32,766. Note that CPU numbers are limited by
+ * CONFIG_NR_CPUS which currently has a maximum supported value of 8,192 (per
+ * arch/x86/Kconfig), so WARN_ON is used to check that a CPU number is less
+ * than 0x8000 when setting the cpu in rps_cpu_qid. The queue index is limited
+ * by configuration.
+ */
+struct rps_cpu_qid {
+ union {
+ u16 val;
+ struct {
+ u16 use_qid: 1;
+ union {
+ u16 cpu: 15;
+ u16 qid: 15;
+ };
+ };
+ };
+};
+
+#define RPS_NO_CPU_QID NO_QUEUE /* No CPU or qid in rps_cpu_qid */
+#define RPS_MAX_CPU 0x7fff /* Maximum cpu in rps_cpu_qid */
+#define RPS_MAX_QID 0x7ffe /* Maximum qid in rps_cpu_qid */
+
/*
* The rps_dev_flow structure contains the mapping of a flow to a CPU, the
* tail pointer for that CPU's input queue at the time of last enqueue, and
* a hardware filter index.
*/
struct rps_dev_flow {
- u16 cpu;
+ struct rps_cpu_qid cpu_qid;
u16 filter;
unsigned int last_qtail;
};
#define RPS_NO_FILTER 0xffff
+static inline void rps_dev_flow_clear(struct rps_dev_flow *dev_flow)
+{
+ dev_flow->cpu_qid.val = RPS_NO_CPU_QID;
+}
+
+static inline void rps_dev_flow_set_cpu(struct rps_dev_flow *dev_flow, u16 cpu)
+{
+ struct rps_cpu_qid cpu_qid;
+
+ if (WARN_ON(cpu > RPS_MAX_CPU))
+ return;
+
+ /* Set the rflow target to the CPU atomically */
+ cpu_qid.use_qid = 0;
+ cpu_qid.cpu = cpu;
+ dev_flow->cpu_qid = cpu_qid;
+}
+
/*
* The rps_dev_flow_table structure contains a table of flow mappings.
*/
@@ -697,34 +744,57 @@ struct rps_dev_flow_table {
#define RPS_DEV_FLOW_TABLE_SIZE(_num) (sizeof(struct rps_dev_flow_table) + \
((_num) * sizeof(struct rps_dev_flow)))
+struct rps_sock_masks {
+ u32 mask;
+ u32 hash_mask;
+};
+
/*
- * The rps_sock_flow_table contains mappings of flows to the last CPU
- * on which they were processed by the application (set in recvmsg).
- * Each entry is a 32bit value. Upper part is the high-order bits
- * of flow hash, lower part is CPU number.
- * rps_cpu_mask is used to partition the space, depending on number of
- * possible CPUs : rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1
- * For example, if 64 CPUs are possible, rps_cpu_mask = 0x3f,
- * meaning we use 32-6=26 bits for the hash.
+ * The rps_sock_flow_table contains mappings of flows to the last CPU on which
+ * they were processed by the application (set in recvmsg), or the mapping of
+ * the flow to a per thread queue for the application. Each entry is a 32bit
+ * value. The high order bit indicates whether a CPU number or a queue index is
+ * stored. The next high-order bits contain the flow hash, and the lower bits
+ * contain the CPU number or queue index. The sock_flow table contains two
+ * sets of masks, one for CPU entries (cpu_masks) and one for queue entries
+ * (queue_masks), that are to used partition the space between the hash bits
+ * and the CPU number or queue index. For the cpu masks, cpu_masks.mask is set
+ * to roundup_pow_of_two(nr_cpu_ids) - 1 and the corresponding hash mask,
+ * cpu_masks.hash_mask, is set to (~cpu_masks.mask & ~RPS_SOCK_FLOW_USE_QID).
+ * For example, if 64 CPUs are possible, cpu_masks.mask == 0x3f, meaning we use
+ * 31-6=25 bits for the hash (so cpu_masks.hash_mask == 0x7fffffc0). Similarly,
+ * queue_masks in rps_sock_flow_table is used to partition the space when a
+ * queue index is present.
*/
struct rps_sock_flow_table {
u32 mask;
+ struct rps_sock_masks cpu_masks;
+ struct rps_sock_masks queue_masks;
u32 ents[] ____cacheline_aligned_in_smp;
};
#define RPS_SOCK_FLOW_TABLE_SIZE(_num) (offsetof(struct rps_sock_flow_table, ents[_num]))
-#define RPS_NO_CPU 0xffff
+#define RPS_SOCK_FLOW_USE_QID (1 << 31)
+#define RPS_SOCK_FLOW_NO_IDENT -1U
-extern u32 rps_cpu_mask;
extern struct rps_sock_flow_table __rcu *rps_sock_flow_table;
+extern unsigned int rps_max_num_queues;
+
+static inline void rps_init_sock_masks(struct rps_sock_masks *masks, u32 num)
+{
+ u32 mask = roundup_pow_of_two(num) - 1;
+
+ masks->mask = mask;
+ masks->hash_mask = (~mask & ~RPS_SOCK_FLOW_USE_QID);
+}
static inline void rps_record_sock_flow(struct rps_sock_flow_table *table,
u32 hash)
{
if (table && hash) {
+ u32 val = hash & table->cpu_masks.hash_mask;
unsigned int index = hash & table->mask;
- u32 val = hash & ~rps_cpu_mask;
/* We only give a hint, preemption can change CPU under us */
val |= raw_smp_processor_id();
@@ -4242,8 +4242,7 @@ static inline void ____napi_schedule(struct softnet_data *sd,
/* One global table that all flow-based protocols share. */
struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
EXPORT_SYMBOL(rps_sock_flow_table);
-u32 rps_cpu_mask __read_mostly;
-EXPORT_SYMBOL(rps_cpu_mask);
+unsigned int rps_max_num_queues;
struct static_key_false rps_needed __read_mostly;
EXPORT_SYMBOL(rps_needed);
@@ -4302,7 +4301,7 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
per_cpu(softnet_data, next_cpu).input_queue_head;
}
- rflow->cpu = next_cpu;
+ rps_dev_flow_set_cpu(rflow, next_cpu);
return rflow;
}
@@ -4349,22 +4348,39 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
sock_flow_table = rcu_dereference(rps_sock_flow_table);
if (flow_table && sock_flow_table) {
+ u32 next_cpu, comparator, ident;
struct rps_dev_flow *rflow;
- u32 next_cpu;
- u32 ident;
/* First check into global flow table if there is a match */
ident = sock_flow_table->ents[hash & sock_flow_table->mask];
- if ((ident ^ hash) & ~rps_cpu_mask)
- goto try_rps;
+ comparator = ((ident & RPS_SOCK_FLOW_USE_QID) ?
+ sock_flow_table->queue_masks.hash_mask :
+ sock_flow_table->cpu_masks.hash_mask);
- next_cpu = ident & rps_cpu_mask;
+ if ((ident ^ hash) & comparator)
+ goto try_rps;
/* OK, now we know there is a match,
* we can look at the local (per receive queue) flow table
*/
rflow = &flow_table->flows[hash & flow_table->mask];
- tcpu = rflow->cpu;
+
+ /* The flow_sock entry may refer to either a queue or a
+ * CPU. Proceed accordingly.
+ */
+ if (ident & RPS_SOCK_FLOW_USE_QID) {
+ /* A queue identifier is in the sock_flow_table entry */
+
+ /* Don't use aRFS to set CPU in this case, skip to
+ * trying RPS
+ */
+ goto try_rps;
+ }
+
+ /* A CPU number is in the sock_flow_table entry */
+
+ next_cpu = ident & sock_flow_table->cpu_masks.mask;
+ tcpu = rflow->cpu_qid.use_qid ? NO_QUEUE : rflow->cpu_qid.cpu;
/*
* If the desired CPU (where last recvmsg was done) is
@@ -4396,10 +4412,8 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
if (map) {
tcpu = map->cpus[reciprocal_scale(hash, map->len)];
- if (cpu_online(tcpu)) {
+ if (cpu_online(tcpu))
cpu = tcpu;
- goto done;
- }
}
done:
@@ -4424,17 +4438,18 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
{
struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
struct rps_dev_flow_table *flow_table;
+ struct rps_cpu_qid cpu_qid;
struct rps_dev_flow *rflow;
bool expire = true;
- unsigned int cpu;
rcu_read_lock();
flow_table = rcu_dereference(rxqueue->rps_flow_table);
if (flow_table && flow_id <= flow_table->mask) {
rflow = &flow_table->flows[flow_id];
- cpu = READ_ONCE(rflow->cpu);
- if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
- ((int)(per_cpu(softnet_data, cpu).input_queue_head -
+ cpu_qid = READ_ONCE(rflow->cpu_qid);
+ if (rflow->filter == filter_id && !cpu_qid.use_qid &&
+ cpu_qid.cpu < nr_cpu_ids &&
+ ((int)(per_cpu(softnet_data, cpu_qid.cpu).input_queue_head -
rflow->last_qtail) <
(int)(10 * flow_table->mask)))
expire = false;
@@ -858,7 +858,7 @@ static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
table->mask = mask;
for (count = 0; count <= mask; count++)
- table->flows[count].cpu = RPS_NO_CPU;
+ rps_dev_flow_clear(&table->flows[count]);
} else {
table = NULL;
}
@@ -65,12 +65,16 @@ static int rps_create_sock_flow_table(size_t size, size_t orig_size,
return -ENOMEM;
sock_table->mask = size - 1;
+ rps_init_sock_masks(&sock_table->cpu_masks,
+ nr_cpu_ids);
+ rps_init_sock_masks(&sock_table->queue_masks,
+ rps_max_num_queues);
} else {
sock_table = orig_table;
}
for (i = 0; i < size; i++)
- sock_table->ents[i] = RPS_NO_CPU;
+ sock_table->ents[i] = RPS_NO_CPU_QID;
} else {
sock_table = NULL;
}