@@ -709,6 +709,9 @@ struct lpfc_hba {
struct workqueue_struct *wq;
struct delayed_work eq_delay_work;
+#define LPFC_IDLE_STAT_DELAY 1000
+ struct delayed_work idle_stat_delay_work;
+
struct lpfc_sli sli;
uint8_t pci_dev_grp; /* lpfc PCI dev group: 0x0, 0x1, 0x2,... */
uint32_t sli_rev; /* SLI2, SLI3, or SLI4 */
@@ -1224,6 +1224,75 @@ lpfc_hb_mbox_cmpl(struct lpfc_hba * phba, LPFC_MBOXQ_t * pmboxq)
return;
}
+/**
+ * lpfc_idle_stat_delay_work - idle_stat tracking
+ *
+ * This routine tracks per-cq idle_stat and determines polling decisions.
+ *
+ * Return codes:
+ * None
+ **/
+static void
+lpfc_idle_stat_delay_work(struct work_struct *work)
+{
+ struct lpfc_hba *phba = container_of(to_delayed_work(work),
+ struct lpfc_hba,
+ idle_stat_delay_work);
+ struct lpfc_queue *cq;
+ struct lpfc_sli4_hdw_queue *hdwq;
+ struct lpfc_idle_stat *idle_stat;
+ u32 i, idle_percent;
+ u64 wall, wall_idle, diff_wall, diff_idle, busy_time;
+
+ if (phba->pport->load_flag & FC_UNLOADING)
+ return;
+
+ if (phba->link_state == LPFC_HBA_ERROR ||
+ phba->pport->fc_flag & FC_OFFLINE_MODE)
+ goto requeue;
+
+ for_each_present_cpu(i) {
+ hdwq = &phba->sli4_hba.hdwq[phba->sli4_hba.cpu_map[i].hdwq];
+ cq = hdwq->io_cq;
+
+ /* Skip if we've already handled this cq's primary CPU */
+ if (cq->chann != i)
+ continue;
+
+ idle_stat = &phba->sli4_hba.idle_stat[i];
+
+ /* get_cpu_idle_time returns values as running counters. Thus,
+ * to know the amount for this period, the prior counter values
+ * need to be subtracted from the current counter values.
+ * From there, the idle time stat can be calculated as a
+ * percentage of 100 - the sum of the other consumption times.
+ */
+ wall_idle = get_cpu_idle_time(i, &wall, 1);
+ diff_idle = wall_idle - idle_stat->prev_idle;
+ diff_wall = wall - idle_stat->prev_wall;
+
+ if (diff_wall <= diff_idle)
+ busy_time = 0;
+ else
+ busy_time = diff_wall - diff_idle;
+
+ idle_percent = div64_u64(100 * busy_time, diff_wall);
+ idle_percent = 100 - idle_percent;
+
+ if (idle_percent < 15)
+ cq->poll_mode = LPFC_QUEUE_WORK;
+ else
+ cq->poll_mode = LPFC_IRQ_POLL;
+
+ idle_stat->prev_idle = wall_idle;
+ idle_stat->prev_wall = wall;
+ }
+
+requeue:
+ schedule_delayed_work(&phba->idle_stat_delay_work,
+ msecs_to_jiffies(LPFC_IDLE_STAT_DELAY));
+}
+
static void
lpfc_hb_eq_delay_work(struct work_struct *work)
{
@@ -2924,6 +2993,7 @@ lpfc_stop_hba_timers(struct lpfc_hba *phba)
if (phba->pport)
lpfc_stop_vport_timers(phba->pport);
cancel_delayed_work_sync(&phba->eq_delay_work);
+ cancel_delayed_work_sync(&phba->idle_stat_delay_work);
del_timer_sync(&phba->sli.mbox_tmo);
del_timer_sync(&phba->fabric_block_timer);
del_timer_sync(&phba->eratt_poll);
@@ -6255,6 +6325,9 @@ lpfc_setup_driver_resource_phase1(struct lpfc_hba *phba)
INIT_DELAYED_WORK(&phba->eq_delay_work, lpfc_hb_eq_delay_work);
+ INIT_DELAYED_WORK(&phba->idle_stat_delay_work,
+ lpfc_idle_stat_delay_work);
+
return 0;
}
@@ -6934,13 +7007,23 @@ lpfc_sli4_driver_resource_setup(struct lpfc_hba *phba)
goto out_free_hba_cpu_map;
}
+ phba->sli4_hba.idle_stat = kcalloc(phba->sli4_hba.num_possible_cpu,
+ sizeof(*phba->sli4_hba.idle_stat),
+ GFP_KERNEL);
+ if (!phba->sli4_hba.idle_stat) {
+ lpfc_printf_log(phba, KERN_ERR, LOG_INIT,
+ "3390 Failed allocation for idle_stat\n");
+ rc = -ENOMEM;
+ goto out_free_hba_eq_info;
+ }
+
#ifdef CONFIG_SCSI_LPFC_DEBUG_FS
phba->sli4_hba.c_stat = alloc_percpu(struct lpfc_hdwq_stat);
if (!phba->sli4_hba.c_stat) {
lpfc_printf_log(phba, KERN_ERR, LOG_INIT,
"3332 Failed allocating per cpu hdwq stats\n");
rc = -ENOMEM;
- goto out_free_hba_eq_info;
+ goto out_free_hba_idle_stat;
}
#endif
@@ -6964,9 +7047,11 @@ lpfc_sli4_driver_resource_setup(struct lpfc_hba *phba)
return 0;
#ifdef CONFIG_SCSI_LPFC_DEBUG_FS
+out_free_hba_idle_stat:
+ kfree(phba->sli4_hba.idle_stat);
+#endif
out_free_hba_eq_info:
free_percpu(phba->sli4_hba.eq_info);
-#endif
out_free_hba_cpu_map:
kfree(phba->sli4_hba.cpu_map);
out_free_hba_eq_hdl:
@@ -7008,6 +7093,7 @@ lpfc_sli4_driver_resource_unset(struct lpfc_hba *phba)
#ifdef CONFIG_SCSI_LPFC_DEBUG_FS
free_percpu(phba->sli4_hba.c_stat);
#endif
+ kfree(phba->sli4_hba.idle_stat);
/* Free memory allocated for msi-x interrupt vector to CPU mapping */
kfree(phba->sli4_hba.cpu_map);
@@ -7300,6 +7300,47 @@ lpfc_post_rq_buffer(struct lpfc_hba *phba, struct lpfc_queue *hrq,
return 1;
}
+/**
+ * lpfc_init_idle_stat_hb - Initialize idle_stat tracking
+ *
+ * This routine initializes the per-cq idle_stat to dynamically dictate
+ * polling decisions.
+ *
+ * Return codes:
+ * None
+ **/
+static void lpfc_init_idle_stat_hb(struct lpfc_hba *phba)
+{
+ int i;
+ struct lpfc_sli4_hdw_queue *hdwq;
+ struct lpfc_queue *cq;
+ struct lpfc_idle_stat *idle_stat;
+ u64 wall;
+
+ for_each_present_cpu(i) {
+ hdwq = &phba->sli4_hba.hdwq[phba->sli4_hba.cpu_map[i].hdwq];
+ cq = hdwq->io_cq;
+
+ /* Skip if we've already handled this cq's primary CPU */
+ if (cq->chann != i)
+ continue;
+
+ idle_stat = &phba->sli4_hba.idle_stat[i];
+
+ idle_stat->prev_idle = get_cpu_idle_time(i, &wall, 1);
+ idle_stat->prev_wall = wall;
+
+ if (phba->nvmet_support)
+ cq->poll_mode = LPFC_QUEUE_WORK;
+ else
+ cq->poll_mode = LPFC_IRQ_POLL;
+ }
+
+ if (!phba->nvmet_support)
+ schedule_delayed_work(&phba->idle_stat_delay_work,
+ msecs_to_jiffies(LPFC_IDLE_STAT_DELAY));
+}
+
static void lpfc_sli4_dip(struct lpfc_hba *phba)
{
uint32_t if_type;
@@ -7877,6 +7918,9 @@ lpfc_sli4_hba_setup(struct lpfc_hba *phba)
queue_delayed_work(phba->wq, &phba->eq_delay_work,
msecs_to_jiffies(LPFC_EQ_DELAY_MSECS));
+ /* start per phba idle_stat_delay heartbeat */
+ lpfc_init_idle_stat_hb(phba);
+
/* Start error attention (ERATT) polling timer */
mod_timer(&phba->eratt_poll,
jiffies + msecs_to_jiffies(1000 * phba->eratt_poll_interval));
@@ -13775,7 +13819,7 @@ lpfc_sli4_sp_handle_eqe(struct lpfc_hba *phba, struct lpfc_eqe *eqe,
if (!ret)
lpfc_printf_log(phba, KERN_ERR, LOG_SLI,
- "0390 Cannot schedule soft IRQ "
+ "0390 Cannot schedule queue work "
"for CQ eqcqid=%d, cqid=%d on CPU %d\n",
cqid, cq->queue_id, raw_smp_processor_id());
}
@@ -13786,6 +13830,7 @@ lpfc_sli4_sp_handle_eqe(struct lpfc_hba *phba, struct lpfc_eqe *eqe,
* @cq: Pointer to CQ to be processed
* @handler: Routine to process each cqe
* @delay: Pointer to usdelay to set in case of rescheduling of the handler
+ * @poll_mode: Polling mode we were called from
*
* This routine processes completion queue entries in a CQ. While a valid
* queue element is found, the handler is called. During processing checks
@@ -13803,7 +13848,8 @@ lpfc_sli4_sp_handle_eqe(struct lpfc_hba *phba, struct lpfc_eqe *eqe,
static bool
__lpfc_sli4_process_cq(struct lpfc_hba *phba, struct lpfc_queue *cq,
bool (*handler)(struct lpfc_hba *, struct lpfc_queue *,
- struct lpfc_cqe *), unsigned long *delay)
+ struct lpfc_cqe *), unsigned long *delay,
+ enum lpfc_poll_mode poll_mode)
{
struct lpfc_cqe *cqe;
bool workposted = false;
@@ -13844,6 +13890,10 @@ __lpfc_sli4_process_cq(struct lpfc_hba *phba, struct lpfc_queue *cq,
arm = false;
}
+ /* Note: complete the irq_poll softirq before rearming CQ */
+ if (poll_mode == LPFC_IRQ_POLL)
+ irq_poll_complete(&cq->iop);
+
/* Track the max number of CQEs processed in 1 EQ */
if (count > cq->CQ_max_cqe)
cq->CQ_max_cqe = count;
@@ -13893,17 +13943,17 @@ __lpfc_sli4_sp_process_cq(struct lpfc_queue *cq)
case LPFC_MCQ:
workposted |= __lpfc_sli4_process_cq(phba, cq,
lpfc_sli4_sp_handle_mcqe,
- &delay);
+ &delay, LPFC_QUEUE_WORK);
break;
case LPFC_WCQ:
if (cq->subtype == LPFC_IO)
workposted |= __lpfc_sli4_process_cq(phba, cq,
lpfc_sli4_fp_handle_cqe,
- &delay);
+ &delay, LPFC_QUEUE_WORK);
else
workposted |= __lpfc_sli4_process_cq(phba, cq,
lpfc_sli4_sp_handle_cqe,
- &delay);
+ &delay, LPFC_QUEUE_WORK);
break;
default:
lpfc_printf_log(phba, KERN_ERR, LOG_SLI,
@@ -13921,7 +13971,7 @@ __lpfc_sli4_sp_process_cq(struct lpfc_queue *cq)
&cq->sched_spwork, delay);
if (!ret)
lpfc_printf_log(phba, KERN_ERR, LOG_SLI,
- "0394 Cannot schedule soft IRQ "
+ "0394 Cannot schedule queue work "
"for cqid=%d on CPU %d\n",
cq->queue_id, cq->chann);
}
@@ -14252,6 +14302,44 @@ lpfc_sli4_fp_handle_cqe(struct lpfc_hba *phba, struct lpfc_queue *cq,
return workposted;
}
+/**
+ * lpfc_sli4_sched_cq_work - Schedules cq work
+ * @phba: Pointer to HBA context object.
+ * @cq: Pointer to CQ
+ * @cqid: CQ ID
+ *
+ * This routine checks the poll mode of the CQ corresponding to
+ * cq->chann, then either schedules a softirq or queue_work to complete
+ * cq work.
+ *
+ * queue_work path is taken if in NVMET mode, or if poll_mode is in
+ * LPFC_QUEUE_WORK mode. Otherwise, softirq path is taken.
+ *
+ **/
+static void lpfc_sli4_sched_cq_work(struct lpfc_hba *phba,
+ struct lpfc_queue *cq, uint16_t cqid)
+{
+ int ret = 0;
+
+ switch (cq->poll_mode) {
+ case LPFC_IRQ_POLL:
+ irq_poll_sched(&cq->iop);
+ break;
+ case LPFC_QUEUE_WORK:
+ default:
+ if (is_kdump_kernel())
+ ret = queue_work(phba->wq, &cq->irqwork);
+ else
+ ret = queue_work_on(cq->chann, phba->wq, &cq->irqwork);
+ if (!ret)
+ lpfc_printf_log(phba, KERN_ERR, LOG_SLI,
+ "0383 Cannot schedule queue work "
+ "for CQ eqcqid=%d, cqid=%d on CPU %d\n",
+ cqid, cq->queue_id,
+ raw_smp_processor_id());
+ }
+}
+
/**
* lpfc_sli4_hba_handle_eqe - Process a fast-path event queue entry
* @phba: Pointer to HBA context object.
@@ -14271,7 +14359,6 @@ lpfc_sli4_hba_handle_eqe(struct lpfc_hba *phba, struct lpfc_queue *eq,
struct lpfc_queue *cq = NULL;
uint32_t qidx = eq->hdwq;
uint16_t cqid, id;
- int ret = 0;
if (unlikely(bf_get_le32(lpfc_eqe_major_code, eqe) != 0)) {
lpfc_printf_log(phba, KERN_ERR, LOG_SLI,
@@ -14331,20 +14418,13 @@ lpfc_sli4_hba_handle_eqe(struct lpfc_hba *phba, struct lpfc_queue *eq,
else
cq->isr_timestamp = 0;
#endif
- if (is_kdump_kernel())
- ret = queue_work(phba->wq, &cq->irqwork);
- else
- ret = queue_work_on(cq->chann, phba->wq, &cq->irqwork);
- if (!ret)
- lpfc_printf_log(phba, KERN_ERR, LOG_SLI,
- "0363 Cannot schedule soft IRQ "
- "for CQ eqcqid=%d, cqid=%d on CPU %d\n",
- cqid, cq->queue_id, raw_smp_processor_id());
+ lpfc_sli4_sched_cq_work(phba, cq, cqid);
}
/**
* __lpfc_sli4_hba_process_cq - Process a fast-path event queue entry
* @cq: Pointer to CQ to be processed
+ * @poll_mode: Enum lpfc_poll_state to determine poll mode
*
* This routine calls the cq processing routine with the handler for
* fast path CQEs.
@@ -14358,7 +14438,8 @@ lpfc_sli4_hba_handle_eqe(struct lpfc_hba *phba, struct lpfc_queue *eq,
* the delay indicates when to reschedule it.
**/
static void
-__lpfc_sli4_hba_process_cq(struct lpfc_queue *cq)
+__lpfc_sli4_hba_process_cq(struct lpfc_queue *cq,
+ enum lpfc_poll_mode poll_mode)
{
struct lpfc_hba *phba = cq->phba;
unsigned long delay;
@@ -14367,7 +14448,7 @@ __lpfc_sli4_hba_process_cq(struct lpfc_queue *cq)
/* process and rearm the CQ */
workposted |= __lpfc_sli4_process_cq(phba, cq, lpfc_sli4_fp_handle_cqe,
- &delay);
+ &delay, poll_mode);
if (delay) {
if (is_kdump_kernel())
@@ -14378,9 +14459,9 @@ __lpfc_sli4_hba_process_cq(struct lpfc_queue *cq)
&cq->sched_irqwork, delay);
if (!ret)
lpfc_printf_log(phba, KERN_ERR, LOG_SLI,
- "0367 Cannot schedule soft IRQ "
- "for cqid=%d on CPU %d\n",
- cq->queue_id, cq->chann);
+ "0367 Cannot schedule queue work "
+ "for cqid=%d on CPU %d\n",
+ cq->queue_id, cq->chann);
}
/* wake up worker thread if there are works to be done */
@@ -14400,7 +14481,7 @@ lpfc_sli4_hba_process_cq(struct work_struct *work)
{
struct lpfc_queue *cq = container_of(work, struct lpfc_queue, irqwork);
- __lpfc_sli4_hba_process_cq(cq);
+ __lpfc_sli4_hba_process_cq(cq, LPFC_QUEUE_WORK);
}
/**
@@ -14415,7 +14496,7 @@ lpfc_sli4_dly_hba_process_cq(struct work_struct *work)
struct lpfc_queue *cq = container_of(to_delayed_work(work),
struct lpfc_queue, sched_irqwork);
- __lpfc_sli4_hba_process_cq(cq);
+ __lpfc_sli4_hba_process_cq(cq, LPFC_QUEUE_WORK);
}
/**
@@ -15090,6 +15171,15 @@ lpfc_eq_create(struct lpfc_hba *phba, struct lpfc_queue *eq, uint32_t imax)
return status;
}
+static int lpfc_cq_poll_hdler(struct irq_poll *iop, int budget)
+{
+ struct lpfc_queue *cq = container_of(iop, struct lpfc_queue, iop);
+
+ __lpfc_sli4_hba_process_cq(cq, LPFC_IRQ_POLL);
+
+ return 1;
+}
+
/**
* lpfc_cq_create - Create a Completion Queue on the HBA
* @phba: HBA structure that indicates port to create a queue on.
@@ -15229,6 +15319,8 @@ lpfc_cq_create(struct lpfc_hba *phba, struct lpfc_queue *cq,
if (cq->queue_id > phba->sli4_hba.cq_max)
phba->sli4_hba.cq_max = cq->queue_id;
+
+ irq_poll_init(&cq->iop, LPFC_IRQ_POLL_WEIGHT, lpfc_cq_poll_hdler);
out:
mempool_free(mbox, phba->mbox_mem_pool);
return status;
@@ -20,6 +20,9 @@
* included with this package. *
*******************************************************************/
+#include <linux/irq_poll.h>
+#include <linux/cpufreq.h>
+
#if defined(CONFIG_DEBUG_FS) && !defined(CONFIG_SCSI_LPFC_DEBUG_FS)
#define CONFIG_SCSI_LPFC_DEBUG_FS
#endif
@@ -135,6 +138,16 @@ struct lpfc_rqb {
struct rqb_dmabuf *);
};
+enum lpfc_poll_mode {
+ LPFC_QUEUE_WORK,
+ LPFC_IRQ_POLL
+};
+
+struct lpfc_idle_stat {
+ u64 prev_idle;
+ u64 prev_wall;
+};
+
struct lpfc_queue {
struct list_head list;
struct list_head wq_list;
@@ -265,6 +278,10 @@ struct lpfc_queue {
struct lpfc_queue *assoc_qp;
struct list_head _poll_list;
void **q_pgs; /* array to index entries per page */
+
+#define LPFC_IRQ_POLL_WEIGHT 256
+ struct irq_poll iop;
+ enum lpfc_poll_mode poll_mode;
};
struct lpfc_sli4_link {
@@ -926,6 +943,7 @@ struct lpfc_sli4_hba {
#ifdef CONFIG_SCSI_LPFC_DEBUG_FS
struct lpfc_hdwq_stat __percpu *c_stat;
#endif
+ struct lpfc_idle_stat *idle_stat;
uint32_t conf_trunk;
#define lpfc_conf_trunk_port0_WORD conf_trunk
#define lpfc_conf_trunk_port0_SHIFT 0