diff mbox series

[v3,3/3] spi: spi-qcom-qspi: Add DMA mode support

Message ID 1681481153-24036-4-git-send-email-quic_vnivarth@quicinc.com
State New
Headers show
Series spi: Add DMA mode support to spi-qcom-qspi | expand

Commit Message

Vijaya Krishna Nivarthi April 14, 2023, 2:05 p.m. UTC
Current driver supports only PIO mode.

HW supports DMA, so add DMA mode support to the driver
for better performance for larger xfers.

Signed-off-by: Vijaya Krishna Nivarthi <quic_vnivarth@quicinc.com>
---
v2 -> v3:
- dropped Reported-by tag
- unset fragment bit only for last cmd_desc of last xfer

v1 -> v2:
- modified commit message
- addressed kernel test robot error
- correct placement of header file includes
- added more comments
- coding style related changes
- renamed variables
- used u32/u8 instead of uint32_t/8_t
- removed unnecessary casting
- retain same MSTR_CONFIG as PIO for DMA
---
 drivers/spi/spi-qcom-qspi.c | 434 ++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 403 insertions(+), 31 deletions(-)

Comments

Vijaya Krishna Nivarthi April 20, 2023, 1:11 p.m. UTC | #1
Hi,

Thank you very much for the review...

Uploaded v4 with below...


On 4/15/2023 3:35 AM, Doug Anderson wrote:
> Hi,
>
> On Fri, Apr 14, 2023 at 7:06 AM Vijaya Krishna Nivarthi
> <quic_vnivarth@quicinc.com> wrote:
>> @@ -14,7 +16,6 @@
>>   #include <linux/spi/spi.h>
>>   #include <linux/spi/spi-mem.h>
>>
>> -
> Drop unrelated whitespace change.


Done

>
>
>> @@ -108,6 +110,10 @@
>>   #define RD_FIFO_RESET          0x0030
>>   #define RESET_FIFO             BIT(0)
>>
>> +#define NEXT_DMA_DESC_ADDR             0x0040
>> +#define CURRENT_DMA_DESC_ADDR  0x0044
>> +#define CURRENT_MEM_ADDR               0x0048
> Looking at the above with a correctly configured editor (tab size=8)
> the numbers don't line up. The first and 3rd have one too many tabs.
>

Corrected

>> @@ -120,6 +126,27 @@ enum qspi_dir {
>>          QSPI_WRITE,
>>   };
>>
>> +struct qspi_cmd_desc {
>> +       u32 data_address;
>> +       u32 next_descriptor;
>> +       u32 direction:1;
>> +       u32 multi_io_mode:3;
>> +       u32 reserved1:4;
>> +       u32 fragment:1;
>> +       u32 reserved2:7;
>> +       u32 length:16;
>> +       /*
>> +        * This marks end of HW cmd descriptor
>> +        * Fields down below are for SW usage to
>> +        * copy data from DMA buffer to rx buffer
>> +        */
>> +       u8 *bounce_src;
>> +       u8 *bounce_dst;
>> +       u32 bounce_length;
>> +};
>> +
>> +#define QSPI_MAX_NUM_DESC 5
> Nothing uses QSPI_MAX_NUM_DESC. Drop it.


Done

>
>
>> @@ -137,11 +164,36 @@ enum qspi_clocks {
>>          QSPI_NUM_CLKS
>>   };
>>
>> +enum qspi_xfer_mode {
>> +       QSPI_INVALID,
>> +       QSPI_FIFO,
>> +       QSPI_DMA
>> +};
> Why bother with INVALID? It's either FIFO or DMA, right?


Dropped

>
>
>> +/* number of entries in sgt returned from spi framework that we can support */
>> +#define QSPI_MAX_SG 5
> Is the above a hardware limitation, or just because you are statically
> allocating arrays? Please clarify in the comment. Would it make sense
> to just dynamically allocate the arrays and remove the need for this
> limitation?


Added comment.

Dynamic allocation may not be required here as we dont expect too many 
cmd descriptors to be necessary.

>
>> +/* 3 descriptors for head, aligned part and tail */
>> +#define QSPI_NUM_CMD_DESC 3
>> +
>> +/* 2 descriptors for head, tail */
>> +#define QSPI_NUM_DAT_DESC 2
>> +
>>   struct qcom_qspi {
>>          void __iomem *base;
>>          struct device *dev;
>>          struct clk_bulk_data *clks;
>>          struct qspi_xfer xfer;
>> +       struct dma_pool *dma_cmd_pool;
>> +       struct dma_pool *dma_data_pool;
>> +       dma_addr_t dma_cmd_desc[QSPI_NUM_CMD_DESC*QSPI_MAX_SG];
>> +       dma_addr_t dma_data_desc[QSPI_NUM_DAT_DESC*QSPI_MAX_SG];
>> +       void *virt_cmd_desc[QSPI_NUM_CMD_DESC*QSPI_MAX_SG];
>> +       void *virt_data_desc[QSPI_NUM_DAT_DESC*QSPI_MAX_SG];
>> +       unsigned int n_cmd_desc;
>> +       unsigned int n_data_desc;
>> +       int xfer_mode;
> Instead of "int", shouldn't xfer_mode be "enum qspi_xfer_mode"?
> Although below I'm proposing that xfer_mode can just be completely
> dropped from this structure.
>

Done

>> @@ -151,18 +203,25 @@ struct qcom_qspi {
>>   static u32 qspi_buswidth_to_iomode(struct qcom_qspi *ctrl,
>>                                     unsigned int buswidth)
>>   {
>> +       u32 ret;
>> +
>> +       /* for DMA we don't write to PIO_XFER_CFG register, so don't shift */
>>          switch (buswidth) {
>>          case 1:
>> -               return SDR_1BIT << MULTI_IO_MODE_SHFT;
>> +               ret = (ctrl->xfer_mode == QSPI_DMA ? SDR_1BIT : SDR_1BIT << MULTI_IO_MODE_SHFT);
>> +               break;
>>          case 2:
>> -               return SDR_2BIT << MULTI_IO_MODE_SHFT;
>> +               ret = (ctrl->xfer_mode == QSPI_DMA ? SDR_2BIT : SDR_2BIT << MULTI_IO_MODE_SHFT);
>> +               break;
>>          case 4:
>> -               return SDR_4BIT << MULTI_IO_MODE_SHFT;
>> +               ret = (ctrl->xfer_mode == QSPI_DMA ? SDR_4BIT : SDR_4BIT << MULTI_IO_MODE_SHFT);
>> +               break;
>>          default:
>>                  dev_warn_once(ctrl->dev,
>>                                  "Unexpected bus width: %u\n", buswidth);
>> -               return SDR_1BIT << MULTI_IO_MODE_SHFT;
>> +               ret = (ctrl->xfer_mode == QSPI_DMA ? SDR_1BIT : SDR_1BIT << MULTI_IO_MODE_SHFT);
>>          }
>> +       return ret;
> Wouldn't it be easier to do the test once at the end? In other words,
> in the switch statement "ret" never contains the shift and then at the
> end:
>
> if (ctrl->xfer_mode != QSPI_DMA)
>    ret <<= MULTI_IO_MODE_SHFT;
> return ret;
>
> ...or, even better, just always return the unshifted mode and do the
> shifting unconditionally in qcom_qspi_pio_xfer_cfg(). Then you never
> need to look at xfer_mode to decide.
>

Agree, Done

>> @@ -241,12 +316,16 @@ static int qcom_qspi_set_speed(struct qcom_qspi *ctrl, unsigned long speed_hz)
>>                  return ret;
>>          }
>>
>> +       avg_bw_cpu = Bps_to_icc(speed_hz);
>>          /*
>> -        * Set BW quota for CPU as driver supports FIFO mode only.
>> -        * We don't have explicit peak requirement so keep it equal to avg_bw.
>> +        * Set BW quota for CPU for FIFO to avg_bw
>> +        * as we don't have explicit peak requirement.
>> +        * TBD TBD TBD - change this as required for DMA.
>> +        * As of now same peak requirement seems to be working.
>>           */
>> -       avg_bw_cpu = Bps_to_icc(speed_hz);
>> -       ret = icc_set_bw(ctrl->icc_path_cpu_to_qspi, avg_bw_cpu, avg_bw_cpu);
>> +       peak_bw_cpu = ctrl->xfer_mode == QSPI_FIFO ? avg_bw_cpu : avg_bw_cpu;
>> +
>> +       ret = icc_set_bw(ctrl->icc_path_cpu_to_qspi, avg_bw_cpu, peak_bw_cpu);
> The change to this function is completely a no-op, right? You check
> the mode against QSPI_FIFO but you set the "peak_bw_cpu" to the same
> thing in both modes. ...and the thing you set it to is exactly the
> same as the function set it to before your patch.
>
> ...so drop all the changes you made to this function.
>

Dropped

>> @@ -258,6 +337,190 @@ static int qcom_qspi_set_speed(struct qcom_qspi *ctrl, unsigned long speed_hz)
>>          return 0;
>>   }
>>
>> +/* aligned to 32 bytes, not to cross 1KB boundary */
>> +#define QSPI_ALIGN_REQ         32
>> +#define QSPI_BOUNDARY_REQ      1024
>> +
>> +static int qcom_qspi_alloc_desc(struct qcom_qspi *ctrl, uint8_t *virt_ptr,
>> +                       dma_addr_t dma_ptr, uint32_t n_bytes)
> Why is "n_bytes" "uint32_t" instead of just "u32"? Please just use
> "u32" consistently in this file.
>

Done

>
>> +               }
>> +       }
>> +       return 0;
>> +
>> +cleanup:
>> +       dev_err(ctrl->dev, "ERROR cleanup in setup_dma_desc\n");
>> +       for (i = 0; i < ctrl->n_data_desc; i++)
>> +               dma_pool_free(ctrl->dma_data_pool, ctrl->virt_data_desc[i],
>> +                                 ctrl->dma_data_desc[i]);
>> +       ctrl->n_data_desc = 0;
>> +
>> +       for (i = 0; i < ctrl->n_cmd_desc; i++)
>> +               dma_pool_free(ctrl->dma_cmd_pool, ctrl->virt_cmd_desc[i],
>> +                                 ctrl->dma_cmd_desc[i]);
>> +       ctrl->n_cmd_desc = 0;
>> +       return ret;
>> +}
>> +
>> +static void qcom_qspi_dma_xfer(struct qcom_qspi *ctrl)
>> +{
>> +       /* Ack any previous interrupts that might be hanging around */
>> +       writel(DMA_CHAIN_DONE, ctrl->base + MSTR_INT_STATUS);
> Do we really need the above? Maybe we can drop it and (in a separate
> patch) drop the similar statement in qcom_qspi_pio_xfer()?
>
> If this is truly needed for some reason, then it seems like in both
> cases you should ack _all_ interrupts (the FIFO plus the DMA ones)
> since we might be switching back and forth between the two modes and
> thus any old interrupts that are "hanging around" could be from
> either. ...but I think you can just drop it. If there are really any
> interrupts "hanging around" we're in pretty bad shape.
>

Dropped

>> +       /* Setup new interrupts */
>> +       writel(DMA_CHAIN_DONE, ctrl->base + MSTR_INT_EN);
>> +
>> +       /* flush all writes */
>> +       wmb();
> Why do you need this explicit wmb()? I'm fairly sure that this is
> handled automatically by the fact that you're using writel() and not
> writel_relaxed(). writel() documents that it is "ordered relative to
> any prior Normal memory access" and certainly it's ordered relative to
> IO writes to the same device.
>

Dropped

>> +
>> +       /* kick off transfer */
>> +       writel((uint32_t)(uintptr_t)(ctrl->dma_cmd_desc)[0], ctrl->base + NEXT_DMA_DESC_ADDR);
> It feels like there's one too many casts here. Shouldn't this just be
> "(u32)(ctrl->dma_cmd_desc[0])"?


Changed

>
>> +}
>> +
>> +/* Switch to DMA if transfer length exceeds this */
>> +#define QSPI_MAX_BYTES_FIFO 64
>> +
>> +static bool qcom_qspi_can_dma(struct spi_controller *ctlr,
>> +                        struct spi_device *slv, struct spi_transfer *xfer)
>> +{
>> +       return xfer->len > QSPI_MAX_BYTES_FIFO ? true : false;
> No need for the "? true : false". Just:
>
> return xfer->len > QSPI_MAX_BYTES_FIFO;
>

Correct. Done.

>> @@ -290,7 +555,25 @@ static int qcom_qspi_transfer_one(struct spi_master *master,
>>          ctrl->xfer.is_last = list_is_last(&xfer->transfer_list,
>>                                            &master->cur_msg->transfers);
>>          ctrl->xfer.rem_bytes = xfer->len;
>> -       qcom_qspi_pio_xfer(ctrl);
>> +
>> +       if (qcom_qspi_can_dma(master, slv, xfer)) {
> Maybe it would be better to check if either "xfer->rx_sg.nents" or
> "xfer->tx_sg.nents" is non-zero. That indicates that the SPI framework
> is expecting you to do DMA.
>

Yes, changed.

>> +               ctrl->xfer_mode = QSPI_DMA;
>> +               ctrl->iomode = qspi_buswidth_to_iomode(ctrl, ctrl->xfer.buswidth);
> Don't store iomode in the "ctrl" structure (remove it from that
> struct). Just make it a local variable in qcom_qspi_setup_dma_desc()
> and then pass it in to the one place that needs it:
> qcom_qspi_alloc_desc()
>

Done

>> +               mstr_cfg |= DMA_ENABLE;
>> +               writel(mstr_cfg, ctrl->base + MSTR_CONFIG);
> nit: I seem to remember IO writes to the controller taking a
> non-trivial amount of time. Maybe worth it to do?
>
> if (!(mstr_cfg & DMA_ENABLE)) {
>    mstr_cfg |= DMA_ENABLE;
>    writel(mstr_cfg, ctrl->base + MSTR_CONFIG);
> }
>
> ...similar for the "else" case below.
>

Done

>> @@ -328,6 +611,40 @@ static int qcom_qspi_prepare_message(struct spi_master *master,
>>          return 0;
>>   }
>>
>> +static int qcom_qspi_alloc_dma(struct qcom_qspi *ctrl)
>> +{
>> +       /* allocate for cmd descriptors pool */
>> +       ctrl->dma_cmd_pool = dma_pool_create("qspi cmd desc pool",
>> +               ctrl->dev, sizeof(struct qspi_cmd_desc), 0, 0);
> Instead of dma_pool_create(), use dmam_pool_create(). That looks to be
> the (oddly named) devm version of the function. Then you can fully get
> rid of qcom_qspi_free_dma() and also the dma_pool_destroy() in your
> error handling below.
>
> It also seems really weird that the "data" pool has such strict
> alignment requirements and you do a whole ton of work to meet those
> requirements, but the "cmd" pool has no alignment requirements at all.
> Is this really correct?
>

Used dmam version.

Yes only the data addresses need to be aligned; there is no constraint 
for cmd descriptors.

>> +       if (!ctrl->dma_cmd_pool) {
>> +               dev_err(ctrl->dev, "Could not create dma pool for cmd_desc\n");
> nit: no need for an error message here. You can assume that allocation
> failures will already print a warning splat and you don't need another
> one for this incredibly unlikely event.
>

Dropped

>> @@ -426,27 +743,63 @@ static irqreturn_t qcom_qspi_irq(int irq, void *dev_id)
>>          int_status = readl(ctrl->base + MSTR_INT_STATUS);
>>          writel(int_status, ctrl->base + MSTR_INT_STATUS);
>>
>> -       if (ctrl->xfer.dir == QSPI_WRITE) {
>> -               if (int_status & WR_FIFO_EMPTY)
>> -                       ret = pio_write(ctrl);
>> -       } else {
>> -               if (int_status & RESP_FIFO_RDY)
>> -                       ret = pio_read(ctrl);
>> -       }
>> -
>> -       if (int_status & QSPI_ERR_IRQS) {
>> -               if (int_status & RESP_FIFO_UNDERRUN)
>> -                       dev_err(ctrl->dev, "IRQ error: FIFO underrun\n");
>> -               if (int_status & WR_FIFO_OVERRUN)
>> -                       dev_err(ctrl->dev, "IRQ error: FIFO overrun\n");
>> -               if (int_status & HRESP_FROM_NOC_ERR)
>> -                       dev_err(ctrl->dev, "IRQ error: NOC response error\n");
>> -               ret = IRQ_HANDLED;
>> -       }
>> -
>> -       if (!ctrl->xfer.rem_bytes) {
>> -               writel(0, ctrl->base + MSTR_INT_EN);
>> -               spi_finalize_current_transfer(dev_get_drvdata(ctrl->dev));
>> +       switch (ctrl->xfer_mode) {
>> +       case QSPI_FIFO:
> I don't think you really need to check xfer_mode here, do you? If
> xfer_mode is FIFO then only the FIFO-related interrupts are enabled.
> If xfer_mode is DMA then only the DMA-related interrupts are enabled.
>
> In fact, I think you can fully get rid of the "xfer_mode" structure
> member completely. It's completely redundant.
>

I think the xfer_mode is required to conditionally cleanup in 
handle_err() and isr()

I have retained it for now; it also seems to help keep isr() more readable.

>> @@ -487,6 +840,9 @@ static int qcom_qspi_probe(struct platform_device *pdev)
>>          if (ret)
>>                  return ret;
>>
>> +       /* set default mode to FIFO */
>> +       ctrl->xfer_mode = QSPI_FIFO;
> Get rid of this initialization. Above I'm suggesting getting rid of
> "xfter" mode altogether, but in any case, we should be setting this
> before each transfer so the initialization doesn't do anything useful.
>

Done

>> @@ -556,10 +923,15 @@ static int qcom_qspi_probe(struct platform_device *pdev)
>>   static void qcom_qspi_remove(struct platform_device *pdev)
>>   {
>>          struct spi_master *master = platform_get_drvdata(pdev);
>> +       struct qcom_qspi *ctrl;
>> +
>> +       ctrl = spi_master_get_devdata(master);
>>
>>          /* Unregister _before_ disabling pm_runtime() so we stop transfers */
>>          spi_unregister_master(master);
>>
>> +       qcom_qspi_free_dma(ctrl);
>> +
>>          pm_runtime_disable(&pdev->dev);
> To make this the reverse order of probe the qcom_qspi_free_dma() call
> should be _after_ the pm_runtime_disable(), although above I'm
> suggesting fully getting rid of qcom_qspi_free_dma() so maybe the
> point is moot.
diff mbox series

Patch

diff --git a/drivers/spi/spi-qcom-qspi.c b/drivers/spi/spi-qcom-qspi.c
index fab1553..cfd92a3 100644
--- a/drivers/spi/spi-qcom-qspi.c
+++ b/drivers/spi/spi-qcom-qspi.c
@@ -2,6 +2,8 @@ 
 // Copyright (c) 2017-2018, The Linux foundation. All rights reserved.
 
 #include <linux/clk.h>
+#include <linux/dmapool.h>
+#include <linux/dma-mapping.h>
 #include <linux/interconnect.h>
 #include <linux/interrupt.h>
 #include <linux/io.h>
@@ -14,7 +16,6 @@ 
 #include <linux/spi/spi.h>
 #include <linux/spi/spi-mem.h>
 
-
 #define QSPI_NUM_CS		2
 #define QSPI_BYTES_PER_WORD	4
 
@@ -62,6 +63,7 @@ 
 #define WR_FIFO_FULL		BIT(10)
 #define WR_FIFO_OVERRUN		BIT(11)
 #define TRANSACTION_DONE	BIT(16)
+#define DMA_CHAIN_DONE		BIT(31)
 #define QSPI_ERR_IRQS		(RESP_FIFO_UNDERRUN | HRESP_FROM_NOC_ERR | \
 				 WR_FIFO_OVERRUN)
 #define QSPI_ALL_IRQS		(QSPI_ERR_IRQS | RESP_FIFO_RDY | \
@@ -108,6 +110,10 @@ 
 #define RD_FIFO_RESET		0x0030
 #define RESET_FIFO		BIT(0)
 
+#define NEXT_DMA_DESC_ADDR		0x0040
+#define CURRENT_DMA_DESC_ADDR	0x0044
+#define CURRENT_MEM_ADDR		0x0048
+
 #define CUR_MEM_ADDR		0x0048
 #define HW_VERSION		0x004c
 #define RD_FIFO			0x0050
@@ -120,6 +126,27 @@  enum qspi_dir {
 	QSPI_WRITE,
 };
 
+struct qspi_cmd_desc {
+	u32 data_address;
+	u32 next_descriptor;
+	u32 direction:1;
+	u32 multi_io_mode:3;
+	u32 reserved1:4;
+	u32 fragment:1;
+	u32 reserved2:7;
+	u32 length:16;
+	/*
+	 * This marks end of HW cmd descriptor
+	 * Fields down below are for SW usage to
+	 * copy data from DMA buffer to rx buffer
+	 */
+	u8 *bounce_src;
+	u8 *bounce_dst;
+	u32 bounce_length;
+};
+
+#define QSPI_MAX_NUM_DESC 5
+
 struct qspi_xfer {
 	union {
 		const void *tx_buf;
@@ -137,11 +164,36 @@  enum qspi_clocks {
 	QSPI_NUM_CLKS
 };
 
+enum qspi_xfer_mode {
+	QSPI_INVALID,
+	QSPI_FIFO,
+	QSPI_DMA
+};
+
+/* number of entries in sgt returned from spi framework that we can support */
+#define QSPI_MAX_SG 5
+
+/* 3 descriptors for head, aligned part and tail */
+#define QSPI_NUM_CMD_DESC 3
+
+/* 2 descriptors for head, tail */
+#define QSPI_NUM_DAT_DESC 2
+
 struct qcom_qspi {
 	void __iomem *base;
 	struct device *dev;
 	struct clk_bulk_data *clks;
 	struct qspi_xfer xfer;
+	struct dma_pool *dma_cmd_pool;
+	struct dma_pool *dma_data_pool;
+	dma_addr_t dma_cmd_desc[QSPI_NUM_CMD_DESC*QSPI_MAX_SG];
+	dma_addr_t dma_data_desc[QSPI_NUM_DAT_DESC*QSPI_MAX_SG];
+	void *virt_cmd_desc[QSPI_NUM_CMD_DESC*QSPI_MAX_SG];
+	void *virt_data_desc[QSPI_NUM_DAT_DESC*QSPI_MAX_SG];
+	unsigned int n_cmd_desc;
+	unsigned int n_data_desc;
+	int xfer_mode;
+	u32 iomode;
 	struct icc_path *icc_path_cpu_to_qspi;
 	unsigned long last_speed;
 	/* Lock to protect data accessed by IRQs */
@@ -151,18 +203,25 @@  struct qcom_qspi {
 static u32 qspi_buswidth_to_iomode(struct qcom_qspi *ctrl,
 				   unsigned int buswidth)
 {
+	u32 ret;
+
+	/* for DMA we don't write to PIO_XFER_CFG register, so don't shift */
 	switch (buswidth) {
 	case 1:
-		return SDR_1BIT << MULTI_IO_MODE_SHFT;
+		ret = (ctrl->xfer_mode == QSPI_DMA ? SDR_1BIT : SDR_1BIT << MULTI_IO_MODE_SHFT);
+		break;
 	case 2:
-		return SDR_2BIT << MULTI_IO_MODE_SHFT;
+		ret = (ctrl->xfer_mode == QSPI_DMA ? SDR_2BIT : SDR_2BIT << MULTI_IO_MODE_SHFT);
+		break;
 	case 4:
-		return SDR_4BIT << MULTI_IO_MODE_SHFT;
+		ret = (ctrl->xfer_mode == QSPI_DMA ? SDR_4BIT : SDR_4BIT << MULTI_IO_MODE_SHFT);
+		break;
 	default:
 		dev_warn_once(ctrl->dev,
 				"Unexpected bus width: %u\n", buswidth);
-		return SDR_1BIT << MULTI_IO_MODE_SHFT;
+		ret = (ctrl->xfer_mode == QSPI_DMA ? SDR_1BIT : SDR_1BIT << MULTI_IO_MODE_SHFT);
 	}
+	return ret;
 }
 
 static void qcom_qspi_pio_xfer_cfg(struct qcom_qspi *ctrl)
@@ -223,6 +282,21 @@  static void qcom_qspi_handle_err(struct spi_master *master,
 	spin_lock_irqsave(&ctrl->lock, flags);
 	writel(0, ctrl->base + MSTR_INT_EN);
 	ctrl->xfer.rem_bytes = 0;
+
+	if (ctrl->xfer_mode == QSPI_DMA) {
+		int i;
+
+		/* free cmd and data descriptors */
+		for (i = 0; i < ctrl->n_data_desc; i++)
+			dma_pool_free(ctrl->dma_data_pool, ctrl->virt_data_desc[i],
+					  ctrl->dma_data_desc[i]);
+		ctrl->n_data_desc = 0;
+
+		for (i = 0; i < ctrl->n_cmd_desc; i++)
+			dma_pool_free(ctrl->dma_cmd_pool, ctrl->virt_cmd_desc[i],
+					  ctrl->dma_cmd_desc[i]);
+		ctrl->n_cmd_desc = 0;
+	}
 	spin_unlock_irqrestore(&ctrl->lock, flags);
 }
 
@@ -230,6 +304,7 @@  static int qcom_qspi_set_speed(struct qcom_qspi *ctrl, unsigned long speed_hz)
 {
 	int ret;
 	unsigned int avg_bw_cpu;
+	unsigned int peak_bw_cpu;
 
 	if (speed_hz == ctrl->last_speed)
 		return 0;
@@ -241,12 +316,16 @@  static int qcom_qspi_set_speed(struct qcom_qspi *ctrl, unsigned long speed_hz)
 		return ret;
 	}
 
+	avg_bw_cpu = Bps_to_icc(speed_hz);
 	/*
-	 * Set BW quota for CPU as driver supports FIFO mode only.
-	 * We don't have explicit peak requirement so keep it equal to avg_bw.
+	 * Set BW quota for CPU for FIFO to avg_bw
+	 * as we don't have explicit peak requirement.
+	 * TBD TBD TBD - change this as required for DMA.
+	 * As of now same peak requirement seems to be working.
 	 */
-	avg_bw_cpu = Bps_to_icc(speed_hz);
-	ret = icc_set_bw(ctrl->icc_path_cpu_to_qspi, avg_bw_cpu, avg_bw_cpu);
+	peak_bw_cpu = ctrl->xfer_mode == QSPI_FIFO ? avg_bw_cpu : avg_bw_cpu;
+
+	ret = icc_set_bw(ctrl->icc_path_cpu_to_qspi, avg_bw_cpu, peak_bw_cpu);
 	if (ret) {
 		dev_err(ctrl->dev, "%s: ICC BW voting failed for cpu: %d\n",
 			__func__, ret);
@@ -258,6 +337,190 @@  static int qcom_qspi_set_speed(struct qcom_qspi *ctrl, unsigned long speed_hz)
 	return 0;
 }
 
+/* aligned to 32 bytes, not to cross 1KB boundary */
+#define QSPI_ALIGN_REQ		32
+#define QSPI_BOUNDARY_REQ	1024
+
+static int qcom_qspi_alloc_desc(struct qcom_qspi *ctrl, uint8_t *virt_ptr,
+			dma_addr_t dma_ptr, uint32_t n_bytes)
+{
+	struct qspi_cmd_desc *virt_cmd_desc, *prev;
+	uint8_t *virt_data_desc;
+	dma_addr_t dma_cmd_desc, dma_data_desc;
+
+	if (virt_ptr && n_bytes >= QSPI_ALIGN_REQ) {
+		dev_err(ctrl->dev,
+			"Exiting to avert memory overwrite, n_bytes-%d\n", n_bytes);
+		return -ENOMEM;
+	}
+
+	/* allocate for dma cmd descriptor */
+	virt_cmd_desc = (struct qspi_cmd_desc *)dma_pool_alloc(ctrl->dma_cmd_pool,
+		GFP_KERNEL, &dma_cmd_desc);
+	if (!virt_cmd_desc) {
+		dev_err(ctrl->dev,
+			"Could not allocate for cmd_desc\n");
+		return -ENOMEM;
+	}
+	ctrl->virt_cmd_desc[ctrl->n_cmd_desc] = virt_cmd_desc;
+	ctrl->dma_cmd_desc[ctrl->n_cmd_desc] = dma_cmd_desc;
+	ctrl->n_cmd_desc++;
+
+	/* allocate for dma data descriptor if unaligned else use pre-aligned */
+	if (virt_ptr) {
+		virt_data_desc = (uint8_t *)dma_pool_zalloc(ctrl->dma_data_pool,
+			GFP_KERNEL, &dma_data_desc);
+		if (!virt_data_desc) {
+			dev_err(ctrl->dev,
+				"Could not allocate for data_desc\n");
+			return -ENOMEM;
+		}
+		ctrl->virt_data_desc[ctrl->n_data_desc] = virt_data_desc;
+		ctrl->dma_data_desc[ctrl->n_data_desc] = dma_data_desc;
+		ctrl->n_data_desc++;
+
+		/*
+		 * for tx copy xfer data into allocated buffer
+		 * for rx setup bounce info to copy after xfer
+		 */
+		if (ctrl->xfer.dir == QSPI_WRITE) {
+			memcpy(virt_data_desc, virt_ptr, n_bytes);
+		} else {
+			virt_cmd_desc->bounce_src = virt_data_desc;
+			virt_cmd_desc->bounce_dst = virt_ptr;
+			virt_cmd_desc->bounce_length = n_bytes;
+		}
+	} else {
+		dma_data_desc = dma_ptr;
+	}
+
+	/* setup cmd descriptor */
+	virt_cmd_desc->data_address = dma_data_desc;
+	virt_cmd_desc->next_descriptor = 0;
+	virt_cmd_desc->direction = ctrl->xfer.dir;
+	virt_cmd_desc->multi_io_mode = ctrl->iomode;
+	virt_cmd_desc->reserved1 = 0;
+	virt_cmd_desc->fragment = ctrl->xfer.is_last ? 0 : 1;
+	virt_cmd_desc->reserved2 = 0;
+	virt_cmd_desc->length = n_bytes;
+
+	/* update previous descriptor */
+	if (ctrl->n_cmd_desc >= 2) {
+		prev = (ctrl->virt_cmd_desc)[ctrl->n_cmd_desc - 2];
+		prev->next_descriptor = dma_cmd_desc;
+		prev->fragment = 1;
+	}
+
+	return 0;
+}
+
+static int qcom_qspi_setup_dma_desc(struct qcom_qspi *ctrl,
+				struct spi_transfer *xfer)
+{
+	int ret;
+	struct sg_table *sgt;
+	unsigned int sg_total_len = 0;
+	dma_addr_t dma_ptr_sg;
+	unsigned int dma_len_sg;
+	uint32_t prolog_bytes, aligned_bytes, epilog_bytes;
+	dma_addr_t aligned_addr;
+	int i;
+	uint8_t *byte_ptr;
+
+	if (ctrl->n_cmd_desc || ctrl->n_data_desc) {
+		dev_err(ctrl->dev, "Remnant dma buffers cmd-%d, data-%d\n",
+			ctrl->n_cmd_desc, ctrl->n_data_desc);
+		return -EIO;
+	}
+
+	sgt = (ctrl->xfer.dir == QSPI_READ) ? &xfer->rx_sg : &xfer->tx_sg;
+	if (!sgt->nents || sgt->nents > QSPI_MAX_SG) {
+		dev_err(ctrl->dev, "Cannot handle %d entries in scatter list\n", sgt->nents);
+		return -EINVAL;
+	}
+
+	for (i = 0; i < sgt->nents; i++)
+		sg_total_len += sg_dma_len(sgt->sgl + i);
+	if (sg_total_len != xfer->len) {
+		dev_err(ctrl->dev, "Data lengths mismatch\n");
+		return -EINVAL;
+	}
+
+	if (ctrl->xfer.dir == QSPI_READ)
+		byte_ptr = xfer->rx_buf;
+	else
+		byte_ptr = (uint8_t *)xfer->tx_buf;
+
+	for (i = 0; i < sgt->nents; i++) {
+		dma_ptr_sg = sg_dma_address(sgt->sgl + i);
+		dma_len_sg = sg_dma_len(sgt->sgl + i);
+
+		aligned_addr = PTR_ALIGN(dma_ptr_sg, QSPI_ALIGN_REQ);
+
+		prolog_bytes = min(dma_len_sg, (unsigned int)(aligned_addr - dma_ptr_sg));
+		if (prolog_bytes) {
+			ret = qcom_qspi_alloc_desc(ctrl, byte_ptr, 0, prolog_bytes);
+			if (ret)
+				goto cleanup;
+			byte_ptr += prolog_bytes;
+		}
+
+		aligned_bytes = PTR_ALIGN_DOWN(dma_len_sg - prolog_bytes, QSPI_ALIGN_REQ);
+		if (aligned_bytes) {
+			ret = qcom_qspi_alloc_desc(ctrl, 0, aligned_addr, aligned_bytes);
+			if (ret)
+				goto cleanup;
+			byte_ptr += aligned_bytes;
+		}
+
+		epilog_bytes = dma_len_sg - prolog_bytes - aligned_bytes;
+		if (epilog_bytes) {
+			ret = qcom_qspi_alloc_desc(ctrl, byte_ptr, 0, epilog_bytes);
+			if (ret)
+				goto cleanup;
+			byte_ptr += epilog_bytes;
+		}
+	}
+	return 0;
+
+cleanup:
+	dev_err(ctrl->dev, "ERROR cleanup in setup_dma_desc\n");
+	for (i = 0; i < ctrl->n_data_desc; i++)
+		dma_pool_free(ctrl->dma_data_pool, ctrl->virt_data_desc[i],
+				  ctrl->dma_data_desc[i]);
+	ctrl->n_data_desc = 0;
+
+	for (i = 0; i < ctrl->n_cmd_desc; i++)
+		dma_pool_free(ctrl->dma_cmd_pool, ctrl->virt_cmd_desc[i],
+				  ctrl->dma_cmd_desc[i]);
+	ctrl->n_cmd_desc = 0;
+	return ret;
+}
+
+static void qcom_qspi_dma_xfer(struct qcom_qspi *ctrl)
+{
+	/* Ack any previous interrupts that might be hanging around */
+	writel(DMA_CHAIN_DONE, ctrl->base + MSTR_INT_STATUS);
+
+	/* Setup new interrupts */
+	writel(DMA_CHAIN_DONE, ctrl->base + MSTR_INT_EN);
+
+	/* flush all writes */
+	wmb();
+
+	/* kick off transfer */
+	writel((uint32_t)(uintptr_t)(ctrl->dma_cmd_desc)[0], ctrl->base + NEXT_DMA_DESC_ADDR);
+}
+
+/* Switch to DMA if transfer length exceeds this */
+#define QSPI_MAX_BYTES_FIFO 64
+
+static bool qcom_qspi_can_dma(struct spi_controller *ctlr,
+			 struct spi_device *slv, struct spi_transfer *xfer)
+{
+	return xfer->len > QSPI_MAX_BYTES_FIFO ? true : false;
+}
+
 static int qcom_qspi_transfer_one(struct spi_master *master,
 				  struct spi_device *slv,
 				  struct spi_transfer *xfer)
@@ -266,6 +529,7 @@  static int qcom_qspi_transfer_one(struct spi_master *master,
 	int ret;
 	unsigned long speed_hz;
 	unsigned long flags;
+	u32 mstr_cfg;
 
 	speed_hz = slv->max_speed_hz;
 	if (xfer->speed_hz)
@@ -276,6 +540,7 @@  static int qcom_qspi_transfer_one(struct spi_master *master,
 		return ret;
 
 	spin_lock_irqsave(&ctrl->lock, flags);
+	mstr_cfg = readl(ctrl->base + MSTR_CONFIG);
 
 	/* We are half duplex, so either rx or tx will be set */
 	if (xfer->rx_buf) {
@@ -290,7 +555,25 @@  static int qcom_qspi_transfer_one(struct spi_master *master,
 	ctrl->xfer.is_last = list_is_last(&xfer->transfer_list,
 					  &master->cur_msg->transfers);
 	ctrl->xfer.rem_bytes = xfer->len;
-	qcom_qspi_pio_xfer(ctrl);
+
+	if (qcom_qspi_can_dma(master, slv, xfer)) {
+		ctrl->xfer_mode = QSPI_DMA;
+		ctrl->iomode = qspi_buswidth_to_iomode(ctrl, ctrl->xfer.buswidth);
+		mstr_cfg |= DMA_ENABLE;
+		writel(mstr_cfg, ctrl->base + MSTR_CONFIG);
+
+		ret = qcom_qspi_setup_dma_desc(ctrl, xfer);
+		if (ret) {
+			spin_unlock_irqrestore(&ctrl->lock, flags);
+			return ret;
+		}
+		qcom_qspi_dma_xfer(ctrl);
+	} else {
+		ctrl->xfer_mode = QSPI_FIFO;
+		mstr_cfg &= ~DMA_ENABLE;
+		writel(mstr_cfg, ctrl->base + MSTR_CONFIG);
+		qcom_qspi_pio_xfer(ctrl);
+	}
 
 	spin_unlock_irqrestore(&ctrl->lock, flags);
 
@@ -328,6 +611,40 @@  static int qcom_qspi_prepare_message(struct spi_master *master,
 	return 0;
 }
 
+static int qcom_qspi_alloc_dma(struct qcom_qspi *ctrl)
+{
+	/* allocate for cmd descriptors pool */
+	ctrl->dma_cmd_pool = dma_pool_create("qspi cmd desc pool",
+		ctrl->dev, sizeof(struct qspi_cmd_desc), 0, 0);
+	if (!ctrl->dma_cmd_pool) {
+		dev_err(ctrl->dev, "Could not create dma pool for cmd_desc\n");
+		return -ENOMEM;
+	}
+
+	/*
+	 * allocate for data descriptors pool as per alignment
+	 * and boundary requirements
+	 */
+	ctrl->dma_data_pool = dma_pool_create("qspi data desc pool",
+		ctrl->dev, QSPI_ALIGN_REQ, QSPI_ALIGN_REQ, QSPI_BOUNDARY_REQ);
+	if (!ctrl->dma_data_pool) {
+		dev_err(ctrl->dev, "Could not create dma pool for data desc\n");
+		dma_pool_destroy(ctrl->dma_cmd_pool);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static void qcom_qspi_free_dma(struct qcom_qspi *ctrl)
+{
+	/* free pool buffers */
+	dma_pool_destroy(ctrl->dma_data_pool);
+
+	/* free pool */
+	dma_pool_destroy(ctrl->dma_cmd_pool);
+}
+
 static irqreturn_t pio_read(struct qcom_qspi *ctrl)
 {
 	u32 rd_fifo_status;
@@ -426,27 +743,63 @@  static irqreturn_t qcom_qspi_irq(int irq, void *dev_id)
 	int_status = readl(ctrl->base + MSTR_INT_STATUS);
 	writel(int_status, ctrl->base + MSTR_INT_STATUS);
 
-	if (ctrl->xfer.dir == QSPI_WRITE) {
-		if (int_status & WR_FIFO_EMPTY)
-			ret = pio_write(ctrl);
-	} else {
-		if (int_status & RESP_FIFO_RDY)
-			ret = pio_read(ctrl);
-	}
-
-	if (int_status & QSPI_ERR_IRQS) {
-		if (int_status & RESP_FIFO_UNDERRUN)
-			dev_err(ctrl->dev, "IRQ error: FIFO underrun\n");
-		if (int_status & WR_FIFO_OVERRUN)
-			dev_err(ctrl->dev, "IRQ error: FIFO overrun\n");
-		if (int_status & HRESP_FROM_NOC_ERR)
-			dev_err(ctrl->dev, "IRQ error: NOC response error\n");
-		ret = IRQ_HANDLED;
-	}
-
-	if (!ctrl->xfer.rem_bytes) {
-		writel(0, ctrl->base + MSTR_INT_EN);
-		spi_finalize_current_transfer(dev_get_drvdata(ctrl->dev));
+	switch (ctrl->xfer_mode) {
+	case QSPI_FIFO:
+		if (ctrl->xfer.dir == QSPI_WRITE) {
+			if (int_status & WR_FIFO_EMPTY)
+				ret = pio_write(ctrl);
+		} else {
+			if (int_status & RESP_FIFO_RDY)
+				ret = pio_read(ctrl);
+		}
+
+		if (int_status & QSPI_ERR_IRQS) {
+			if (int_status & RESP_FIFO_UNDERRUN)
+				dev_err(ctrl->dev, "IRQ error: FIFO underrun\n");
+			if (int_status & WR_FIFO_OVERRUN)
+				dev_err(ctrl->dev, "IRQ error: FIFO overrun\n");
+			if (int_status & HRESP_FROM_NOC_ERR)
+				dev_err(ctrl->dev, "IRQ error: NOC response error\n");
+			ret = IRQ_HANDLED;
+		}
+
+		if (!ctrl->xfer.rem_bytes) {
+			writel(0, ctrl->base + MSTR_INT_EN);
+			spi_finalize_current_transfer(dev_get_drvdata(ctrl->dev));
+		}
+		break;
+	case QSPI_DMA:
+		if (int_status & DMA_CHAIN_DONE) {
+			int i;
+
+			writel(0, ctrl->base + MSTR_INT_EN);
+
+			if (ctrl->xfer.dir == QSPI_READ) {
+				struct qspi_cmd_desc *cmd_desc;
+
+				for (i = 0; i < ctrl->n_cmd_desc; i++) {
+					cmd_desc = (struct qspi_cmd_desc *)ctrl->virt_cmd_desc[i];
+					memcpy(cmd_desc->bounce_dst,
+						cmd_desc->bounce_src, cmd_desc->bounce_length);
+				}
+			}
+
+			for (i = 0; i < ctrl->n_data_desc; i++)
+				dma_pool_free(ctrl->dma_data_pool, ctrl->virt_data_desc[i],
+						  ctrl->dma_data_desc[i]);
+			ctrl->n_data_desc = 0;
+
+			for (i = 0; i < ctrl->n_cmd_desc; i++)
+				dma_pool_free(ctrl->dma_cmd_pool, ctrl->virt_cmd_desc[i],
+						  ctrl->dma_cmd_desc[i]);
+			ctrl->n_cmd_desc = 0;
+
+			ret = IRQ_HANDLED;
+			spi_finalize_current_transfer(dev_get_drvdata(ctrl->dev));
+		}
+		break;
+	default:
+		dev_err(ctrl->dev, "Unknown xfer mode:%d", ctrl->xfer_mode);
 	}
 
 	spin_unlock(&ctrl->lock);
@@ -487,6 +840,9 @@  static int qcom_qspi_probe(struct platform_device *pdev)
 	if (ret)
 		return ret;
 
+	/* set default mode to FIFO */
+	ctrl->xfer_mode = QSPI_FIFO;
+
 	ctrl->icc_path_cpu_to_qspi = devm_of_icc_get(dev, "qspi-config");
 	if (IS_ERR(ctrl->icc_path_cpu_to_qspi))
 		return dev_err_probe(dev, PTR_ERR(ctrl->icc_path_cpu_to_qspi),
@@ -517,7 +873,12 @@  static int qcom_qspi_probe(struct platform_device *pdev)
 		return ret;
 	}
 
+	ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(32));
+	if (ret)
+		return dev_err_probe(dev, ret, "could not set DMA mask\n");
+
 	master->max_speed_hz = 300000000;
+	master->max_dma_len = 65536; /* as per HPG */
 	master->num_chipselect = QSPI_NUM_CS;
 	master->bus_num = -1;
 	master->dev.of_node = pdev->dev.of_node;
@@ -528,6 +889,7 @@  static int qcom_qspi_probe(struct platform_device *pdev)
 	master->prepare_message = qcom_qspi_prepare_message;
 	master->transfer_one = qcom_qspi_transfer_one;
 	master->handle_err = qcom_qspi_handle_err;
+	master->can_dma = qcom_qspi_can_dma;
 	master->auto_runtime_pm = true;
 
 	ret = devm_pm_opp_set_clkname(&pdev->dev, "core");
@@ -540,6 +902,11 @@  static int qcom_qspi_probe(struct platform_device *pdev)
 		return ret;
 	}
 
+	/* allocate for DMA descriptor pools */
+	ret = qcom_qspi_alloc_dma(ctrl);
+	if (ret)
+		return ret;
+
 	pm_runtime_use_autosuspend(dev);
 	pm_runtime_set_autosuspend_delay(dev, 250);
 	pm_runtime_enable(dev);
@@ -556,10 +923,15 @@  static int qcom_qspi_probe(struct platform_device *pdev)
 static void qcom_qspi_remove(struct platform_device *pdev)
 {
 	struct spi_master *master = platform_get_drvdata(pdev);
+	struct qcom_qspi *ctrl;
+
+	ctrl = spi_master_get_devdata(master);
 
 	/* Unregister _before_ disabling pm_runtime() so we stop transfers */
 	spi_unregister_master(master);
 
+	qcom_qspi_free_dma(ctrl);
+
 	pm_runtime_disable(&pdev->dev);
 }