diff mbox series

arm: stm32mp1: activate data cache in SPL and before relocation

Message ID 20200330094610.1.I2ff601b652f4995a3401dc67c2369a4187046ed8@changeid
State New
Headers show
Series arm: stm32mp1: activate data cache in SPL and before relocation | expand

Commit Message

Patrick Delaunay March 30, 2020, 7:46 a.m. UTC
Activate the data cache in SPL and in U-Boot before relocation
- before relocation, the TLB is located after U-Boot
  (CONFIG_SYS_TEXT_BASE and an assumed 2MB max size)
  and all the DDR is marked cacheable
- in SPL, the TLB is located at the end of SYSRAM, just after the stack
  (CONFIG_SPL_STACK) with size PGTABLE_SIZE = 16kB
  and all the SYSRAM is marked cacheable

This patch allows to reduce the execution time, particularly
- for the device tree parsing in U-Boot pre-reloc stage
  (dm_extended_scan_fd =>dm_scan_fdt)
- in I2C timing computation in SPL (stm32_i2c_choose_solution())

For example, the result on STM32MP157C-DK2 board is:
   1,6s gain for trusted boot chain with TF-A
   2,2s gain for basic boot chain with SPL

Signed-off-by: Patrick Delaunay <patrick.delaunay at st.com>
---

Example of bootstage report on STM32MP157C-DK2

1/ For trusted boot chain with TF-A

a) Before the patch

    STM32MP> bootstage report
    Timer summary in microseconds (9 records):
           Mark    Elapsed  Stage
              0          0  reset
        583,290    583,290  board_init_f
      2,348,898  1,765,608  board_init_r
      2,664,580    315,682  id=64
      2,704,027     39,447  id=65
      2,704,729        702  main_loop
      5,563,519  2,858,790  id=175

    Accumulated time:
                    41,696  dm_r
                   615,561  dm_f

b) After the patch
    STM32MP> bootstage report

Timer summary in microseconds (9 records):
       Mark    Elapsed  Stage
          0          0  reset
    577,841    577,841  board_init_f
    722,178    144,337  board_init_r
  1,038,458    316,280  id=64
  1,078,298     39,840  id=65
  1,078,999        701  main_loop
  4,169,020  3,090,021  id=175

Accumulated time:
                36,330  dm_f
                41,999  dm_r

2/ And for the basic boot chain with SPL

a) Before the patch:

    STM32MP> bootstage report
    Timer summary in microseconds (12 records):
           Mark    Elapsed  Stage
              0          0  reset
        195,613    195,613  SPL
        837,867    642,254  end SPL
        840,117      2,250  board_init_f
      2,739,639  1,899,522  board_init_r
      3,066,815    327,176  id=64
      3,103,377     36,562  id=65
      3,104,078        701  main_loop
      3,142,171     38,093  id=175

    Accumulated time:
                    38,124  dm_spl
                    41,956  dm_r
                   648,861  dm_f

b) After the patch

    STM32MP> bootstage report
    Timer summary in microseconds (12 records):
           Mark    Elapsed  Stage
              0          0  reset
        195,859    195,859  SPL
        330,190    134,331  end SPL
        332,408      2,218  board_init_f
        482,688    150,280  board_init_r
        808,694    326,006  id=64
        845,029     36,335  id=65
        845,730        701  main_loop
      3,281,876  2,436,146  id=175

    Accumulated time:
                     3,169  dm_spl
                    36,041  dm_f
                    41,701  dm_r


 arch/arm/mach-stm32mp/cpu.c       | 28 ++++++++++++++++++++++++-
 arch/arm/mach-stm32mp/dram_init.c | 35 +++++++++++++++++++++++++++++++
 include/configs/stm32mp1.h        |  4 ++--
 3 files changed, 64 insertions(+), 3 deletions(-)

Comments

Marek Vasut March 30, 2020, 9:56 a.m. UTC | #1
On 3/30/20 9:46 AM, Patrick Delaunay wrote:
[...]

> 2/ And for the basic boot chain with SPL
> 
> a) Before the patch:
> 
>     STM32MP> bootstage report
>     Timer summary in microseconds (12 records):
>            Mark    Elapsed  Stage
>               0          0  reset
>         195,613    195,613  SPL
>         837,867    642,254  end SPL
>         840,117      2,250  board_init_f
>       2,739,639  1,899,522  board_init_r
>       3,066,815    327,176  id=64
>       3,103,377     36,562  id=65
>       3,104,078        701  main_loop
>       3,142,171     38,093  id=175
> 
>     Accumulated time:
>                     38,124  dm_spl
>                     41,956  dm_r
>                    648,861  dm_f
> 
> b) After the patch
> 
>     STM32MP> bootstage report
>     Timer summary in microseconds (12 records):
>            Mark    Elapsed  Stage
>               0          0  reset
>         195,859    195,859  SPL
>         330,190    134,331  end SPL
>         332,408      2,218  board_init_f
>         482,688    150,280  board_init_r
>         808,694    326,006  id=64
>         845,029     36,335  id=65
>         845,730        701  main_loop
>       3,281,876  2,436,146  id=175
> 
>     Accumulated time:
>                      3,169  dm_spl
>                     36,041  dm_f
>                     41,701  dm_r

Nice.

[...]

> diff --git a/arch/arm/mach-stm32mp/cpu.c b/arch/arm/mach-stm32mp/cpu.c
> index 36a9205819..579468a1a0 100644
> --- a/arch/arm/mach-stm32mp/cpu.c
> +++ b/arch/arm/mach-stm32mp/cpu.c
> @@ -193,6 +193,26 @@ int arch_cpu_init(void)
>  {
>  	u32 boot_mode;
>  
> +	/*
> +	 * initialialize the MMU and activate data cache
> +	 * in SPL or in U- Boot pre-reloc stage
> +	 */
> +	gd->arch.tlb_size = PGTABLE_SIZE;
> +#if defined(CONFIG_SPL_BUILD)
> +#if (STM32_SYSRAM_BASE + STM32_SYSRAM_SIZE - CONFIG_SPL_STACK < PGTABLE_SIZE)
> +#error "The reserved memory for SPL PGTABLE is not enough."
> +#endif

Move this check to the beginning of the file, out of the code.

> +	gd->arch.tlb_addr = CONFIG_SPL_STACK;

if (IS_ENABLED(SPL_BUILD))
 gd->....
else
 gd->....

> +#else
> +	/* TLB is located after U-Boot, assumed 2MB as max size */
> +	gd->arch.tlb_addr = CONFIG_SYS_TEXT_BASE + SZ_2M;
> +#endif

I think you also need to set arch.tlb_size . Also, this should be a
separate function.

> +	dcache_enable();

What about icache, shouldn't that one be enabled too ?

> +	/*
> +	 * MMU/TLB is updated in enable_caches() for U-Boot after relocation
> +	 * or is deactivated in U-Boot start.S::cpu_init_cp15 for SPL
> +	 */
> +
>  	/* early armv7 timer init: needed for polling */
>  	timer_init();
>  
> @@ -225,7 +245,13 @@ int arch_cpu_init(void)
>  
>  void enable_caches(void)
>  {

Icache should be enabled here too. You don't know what e.g. a debugger
did to caches.

> -	/* Enable D-cache. I-cache is already enabled in start.S */
> +	/* I-cache is already enabled in start.S */
> +	/* deactivate the data cache, early enabled in arch_cpu_init() */
> +	dcache_disable();
> +	/*
> +	 * update MMU after relocation, the TLB location was udpated in
> +	 * board_f.c::reserve_mmu, and enable the data cache
> +	 */
>  	dcache_enable();
>  }
>  
> diff --git a/arch/arm/mach-stm32mp/dram_init.c b/arch/arm/mach-stm32mp/dram_init.c
> index 3233415eff..15b39ecc03 100644
> --- a/arch/arm/mach-stm32mp/dram_init.c
> +++ b/arch/arm/mach-stm32mp/dram_init.c
> @@ -7,9 +7,29 @@
>  #include <dm.h>
>  #include <lmb.h>
>  #include <ram.h>
> +#include <asm/cache.h>
>  
>  DECLARE_GLOBAL_DATA_PTR;
>  
> +static void set_mmu_dcache(u32 addr, u32 size)
> +{
> +	int	i;
> +	u32 start, end;
> +
> +	start = addr >> MMU_SECTION_SHIFT;
> +	end = ((u64)addr + (u64)size) >> MMU_SECTION_SHIFT;

Is this a copy of dram_bank_mmu_setup() in arch/arm/lib/cache-cp15.c ?
Why ?

> +	for (i = start; i < end; i++) {
> +#if defined(CONFIG_SYS_ARM_CACHE_WRITETHROUGH)
> +		set_section_dcache(i, DCACHE_WRITETHROUGH);
> +#elif defined(CONFIG_SYS_ARM_CACHE_WRITEALLOC)
> +		set_section_dcache(i, DCACHE_WRITEALLOC);
> +#else
> +		set_section_dcache(i, DCACHE_WRITEBACK);
> +#endif
> +	}
> +}

[...]

> diff --git a/include/configs/stm32mp1.h b/include/configs/stm32mp1.h
> index c34a720e0c..5203fc93ad 100644
> --- a/include/configs/stm32mp1.h
> +++ b/include/configs/stm32mp1.h
> @@ -58,8 +58,8 @@
>  
>  /* limit SYSRAM usage to first 128 KB */
>  #define CONFIG_SPL_MAX_SIZE		0x00020000
> -#define CONFIG_SPL_STACK		(STM32_SYSRAM_BASE + \
> -					 STM32_SYSRAM_SIZE)
> +/* stack at STM32_SYSRAM_BASE + STM32_SYSRAM_SIZE - PGTABLE_SIZE (=16kB) */
> +#define CONFIG_SPL_STACK		0x2FFFC000

Can't you memalign() the pagetable area instead of this hacking around?
Or use something around board_init_f_alloc_reserve().
Patrick Delaunay March 30, 2020, 1:49 p.m. UTC | #2
Hi Marek,

> From: Marek Vasut <marex at denx.de>
> Sent: lundi 30 mars 2020 11:57
> 
> On 3/30/20 9:46 AM, Patrick Delaunay wrote:
> [...]
> 
> > 2/ And for the basic boot chain with SPL
> >
> > a) Before the patch:
> >
> >     STM32MP> bootstage report
> >     Timer summary in microseconds (12 records):
> >            Mark    Elapsed  Stage
> >               0          0  reset
> >         195,613    195,613  SPL
> >         837,867    642,254  end SPL
> >         840,117      2,250  board_init_f
> >       2,739,639  1,899,522  board_init_r
> >       3,066,815    327,176  id=64
> >       3,103,377     36,562  id=65
> >       3,104,078        701  main_loop
> >       3,142,171     38,093  id=175
> >
> >     Accumulated time:
> >                     38,124  dm_spl
> >                     41,956  dm_r
> >                    648,861  dm_f
> >
> > b) After the patch
> >
> >     STM32MP> bootstage report
> >     Timer summary in microseconds (12 records):
> >            Mark    Elapsed  Stage
> >               0          0  reset
> >         195,859    195,859  SPL
> >         330,190    134,331  end SPL
> >         332,408      2,218  board_init_f
> >         482,688    150,280  board_init_r
> >         808,694    326,006  id=64
> >         845,029     36,335  id=65
> >         845,730        701  main_loop
> >       3,281,876  2,436,146  id=175
> >
> >     Accumulated time:
> >                      3,169  dm_spl
> >                     36,041  dm_f
> >                     41,701  dm_r
> 
> Nice.
> 
> [...]
> 
> > diff --git a/arch/arm/mach-stm32mp/cpu.c b/arch/arm/mach-stm32mp/cpu.c
> > index 36a9205819..579468a1a0 100644
> > --- a/arch/arm/mach-stm32mp/cpu.c
> > +++ b/arch/arm/mach-stm32mp/cpu.c
> > @@ -193,6 +193,26 @@ int arch_cpu_init(void)  {
> >  	u32 boot_mode;
> >
> > +	/*
> > +	 * initialialize the MMU and activate data cache
> > +	 * in SPL or in U- Boot pre-reloc stage
> > +	 */
> > +	gd->arch.tlb_size = PGTABLE_SIZE;
> > +#if defined(CONFIG_SPL_BUILD)
> > +#if (STM32_SYSRAM_BASE + STM32_SYSRAM_SIZE -
> CONFIG_SPL_STACK <
> > +PGTABLE_SIZE) #error "The reserved memory for SPL PGTABLE is not
> enough."
> > +#endif
> 
> Move this check to the beginning of the file, out of the code.

Ok

> > +	gd->arch.tlb_addr = CONFIG_SPL_STACK;
> 
> if (IS_ENABLED(SPL_BUILD))
>  gd->....
> else
>  gd->....

Yes with

IS_ENABLED(CONFIG_SPL_BUILD)

> > +#else
> > +	/* TLB is located after U-Boot, assumed 2MB as max size */
> > +	gd->arch.tlb_addr = CONFIG_SYS_TEXT_BASE + SZ_2M; #endif
> 
> I think you also need to set arch.tlb_size . Also, this should be a separate function.

It is done before the test / common for SPL or U-Boot case.

	gd->arch.tlb_size = PGTABLE_SIZE;

but I will create a function early_early_caches()

> > +	dcache_enable();
> 
> What about icache, shouldn't that one be enabled too ?

It is not needed... 
I-cache is already activated in start.S::cpu_init_cp15 for arm V7

But I will add a comment.

 
> > +	/*
> > +	 * MMU/TLB is updated in enable_caches() for U-Boot after relocation
> > +	 * or is deactivated in U-Boot start.S::cpu_init_cp15 for SPL
> > +	 */
> > +
> >  	/* early armv7 timer init: needed for polling */
> >  	timer_init();
> >
> > @@ -225,7 +245,13 @@ int arch_cpu_init(void)
> >
> >  void enable_caches(void)
> >  {
> 
> Icache should be enabled here too. You don't know what e.g. a debugger did to
> caches.
> 
> > -	/* Enable D-cache. I-cache is already enabled in start.S */
> > +	/* I-cache is already enabled in start.S */

Not needed for arm V7 (I copy this function from other platfrom ... I don't remember which one)

I-cache is already activated in start.S::cpu_init_cp15 as indicated in this comment

> > +	/* deactivate the data cache, early enabled in arch_cpu_init() */
> > +	dcache_disable();
> > +	/*
> > +	 * update MMU after relocation, the TLB location was udpated in
> > +	 * board_f.c::reserve_mmu, and enable the data cache
> > +	 */
> >  	dcache_enable();
> >  }
> >
> > diff --git a/arch/arm/mach-stm32mp/dram_init.c
> > b/arch/arm/mach-stm32mp/dram_init.c
> > index 3233415eff..15b39ecc03 100644
> > --- a/arch/arm/mach-stm32mp/dram_init.c
> > +++ b/arch/arm/mach-stm32mp/dram_init.c
> > @@ -7,9 +7,29 @@
> >  #include <dm.h>
> >  #include <lmb.h>
> >  #include <ram.h>
> > +#include <asm/cache.h>
> >
> >  DECLARE_GLOBAL_DATA_PTR;
> >
> > +static void set_mmu_dcache(u32 addr, u32 size) {
> > +	int	i;
> > +	u32 start, end;
> > +
> > +	start = addr >> MMU_SECTION_SHIFT;
> > +	end = ((u64)addr + (u64)size) >> MMU_SECTION_SHIFT;
> 
> Is this a copy of dram_bank_mmu_setup() in arch/arm/lib/cache-cp15.c ?
> Why ?

It is not just a copy...

set__mmu_dache is only a static helper for  function dram_bank_mmu_setup()

I override the default implementation of the weak functon dram_bank_mmu_setup()

1/ mark  SYSRAM cacheable in SPL (embedded RAM used by SPL)

2/ mark beginning of DDR cacheable in U-Boot pre-reloc (today all the DDR)
    => I prepare a possible TF-A limitation: when TF-A is running in DDR,
         a part of DDR is securized and can't be mapped to avoid speculative access 

3/ after relocation, DDR init is performed.... 
    use the default implementation to map all the DDR (gd->bd->bi_dram[0])

   PS: in future patches, I will only limit this case for reserved memory part,
           with "no-map" tag (also for TF-A requirement)

Now after my explcation I found a issue in my patch...
SPL also use DDR (at least for CONFIG_SYS_SPL_MALLOC_START	0xC0300000) ,
 so I need to marked DDR as cacheache also for SPL

> > +	for (i = start; i < end; i++) {
> > +#if defined(CONFIG_SYS_ARM_CACHE_WRITETHROUGH)
> > +		set_section_dcache(i, DCACHE_WRITETHROUGH); #elif
> > +defined(CONFIG_SYS_ARM_CACHE_WRITEALLOC)
> > +		set_section_dcache(i, DCACHE_WRITEALLOC); #else
> > +		set_section_dcache(i, DCACHE_WRITEBACK); #endif
> > +	}
> > +}
> 
> [...]
> 
> > diff --git a/include/configs/stm32mp1.h b/include/configs/stm32mp1.h
> > index c34a720e0c..5203fc93ad 100644
> > --- a/include/configs/stm32mp1.h
> > +++ b/include/configs/stm32mp1.h
> > @@ -58,8 +58,8 @@
> >
> >  /* limit SYSRAM usage to first 128 KB */
> >  #define CONFIG_SPL_MAX_SIZE		0x00020000
> > -#define CONFIG_SPL_STACK		(STM32_SYSRAM_BASE + \
> > -					 STM32_SYSRAM_SIZE)
> > +/* stack at STM32_SYSRAM_BASE + STM32_SYSRAM_SIZE -
> PGTABLE_SIZE (=16kB) */
> > +#define CONFIG_SPL_STACK		0x2FFFC000
> 
> Can't you memalign() the pagetable area instead of this hacking around?
> Or use something around board_init_f_alloc_reserve().

It was my first idea: use malloc

But as I try to activate the data cache as soon as possible.
So before spl_early_init call (for spl in board_init_f) and malloc is not yet accessible.

And board_init_f_alloc_reserve  is only called in U-Boot board-f.c..... 
after the MMU configuration for pre-relocation / not called in SPL.

I don't see this address as hack but a memory configuration of SYSRAM:

SYRAM content (board_f)
- SPL code
- SPL data
- SPL stack (reversed order)
- TTB 

But I can move it in BSS as global apl variable, I need to think about it....
It is probably more clean.

Regards

Patrick
Marek Vasut March 30, 2020, 2:03 p.m. UTC | #3
On 3/30/20 3:49 PM, Patrick DELAUNAY wrote:
> Hi Marek,

Hi,

[...]

>>> -	/* Enable D-cache. I-cache is already enabled in start.S */
>>> +	/* I-cache is already enabled in start.S */
> 
> Not needed for arm V7 (I copy this function from other platfrom ... I don't remember which one)

Maybe this needs to be de-duplicated if it's a copy ?

[...]

>>> diff --git a/arch/arm/mach-stm32mp/dram_init.c
>>> b/arch/arm/mach-stm32mp/dram_init.c
>>> index 3233415eff..15b39ecc03 100644
>>> --- a/arch/arm/mach-stm32mp/dram_init.c
>>> +++ b/arch/arm/mach-stm32mp/dram_init.c
>>> @@ -7,9 +7,29 @@
>>>  #include <dm.h>
>>>  #include <lmb.h>
>>>  #include <ram.h>
>>> +#include <asm/cache.h>
>>>
>>>  DECLARE_GLOBAL_DATA_PTR;
>>>
>>> +static void set_mmu_dcache(u32 addr, u32 size) {
>>> +	int	i;
>>> +	u32 start, end;
>>> +
>>> +	start = addr >> MMU_SECTION_SHIFT;
>>> +	end = ((u64)addr + (u64)size) >> MMU_SECTION_SHIFT;
>>
>> Is this a copy of dram_bank_mmu_setup() in arch/arm/lib/cache-cp15.c ?
>> Why ?
> 
> It is not just a copy...
> 
> set__mmu_dache is only a static helper for  function dram_bank_mmu_setup()
> 
> I override the default implementation of the weak functon dram_bank_mmu_setup()

Can you instead augment the original implementation to cater for this
usecase or would that be too difficult ?

> 1/ mark  SYSRAM cacheable in SPL (embedded RAM used by SPL)
> 
> 2/ mark beginning of DDR cacheable in U-Boot pre-reloc (today all the DDR)
>     => I prepare a possible TF-A limitation: when TF-A is running in DDR,
>          a part of DDR is securized and can't be mapped to avoid speculative access 
> 
> 3/ after relocation, DDR init is performed.... 
>     use the default implementation to map all the DDR (gd->bd->bi_dram[0])
> 
>    PS: in future patches, I will only limit this case for reserved memory part,
>            with "no-map" tag (also for TF-A requirement)

OK, TF-A is just adding more and more problems.

> Now after my explcation I found a issue in my patch...
> SPL also use DDR (at least for CONFIG_SYS_SPL_MALLOC_START	0xC0300000) ,
>  so I need to marked DDR as cacheache also for SPL
> 
>>> +	for (i = start; i < end; i++) {
>>> +#if defined(CONFIG_SYS_ARM_CACHE_WRITETHROUGH)
>>> +		set_section_dcache(i, DCACHE_WRITETHROUGH); #elif
>>> +defined(CONFIG_SYS_ARM_CACHE_WRITEALLOC)
>>> +		set_section_dcache(i, DCACHE_WRITEALLOC); #else
>>> +		set_section_dcache(i, DCACHE_WRITEBACK); #endif
>>> +	}
>>> +}
>>
>> [...]
>>
>>> diff --git a/include/configs/stm32mp1.h b/include/configs/stm32mp1.h
>>> index c34a720e0c..5203fc93ad 100644
>>> --- a/include/configs/stm32mp1.h
>>> +++ b/include/configs/stm32mp1.h
>>> @@ -58,8 +58,8 @@
>>>
>>>  /* limit SYSRAM usage to first 128 KB */
>>>  #define CONFIG_SPL_MAX_SIZE		0x00020000
>>> -#define CONFIG_SPL_STACK		(STM32_SYSRAM_BASE + \
>>> -					 STM32_SYSRAM_SIZE)
>>> +/* stack at STM32_SYSRAM_BASE + STM32_SYSRAM_SIZE -
>> PGTABLE_SIZE (=16kB) */
>>> +#define CONFIG_SPL_STACK		0x2FFFC000
>>
>> Can't you memalign() the pagetable area instead of this hacking around?
>> Or use something around board_init_f_alloc_reserve().
> 
> It was my first idea: use malloc
> 
> But as I try to activate the data cache as soon as possible.
> So before spl_early_init call (for spl in board_init_f) and malloc is not yet accessible.
> 
> And board_init_f_alloc_reserve  is only called in U-Boot board-f.c..... 
> after the MMU configuration for pre-relocation / not called in SPL.
> 
> I don't see this address as hack but a memory configuration of SYSRAM:
> 
> SYRAM content (board_f)
> - SPL code
> - SPL data
> - SPL stack (reversed order)
> - TTB 
> 
> But I can move it in BSS as global apl variable, I need to think about it....
> It is probably more clean.

Please do :)
diff mbox series

Patch

diff --git a/arch/arm/mach-stm32mp/cpu.c b/arch/arm/mach-stm32mp/cpu.c
index 36a9205819..579468a1a0 100644
--- a/arch/arm/mach-stm32mp/cpu.c
+++ b/arch/arm/mach-stm32mp/cpu.c
@@ -193,6 +193,26 @@  int arch_cpu_init(void)
 {
 	u32 boot_mode;
 
+	/*
+	 * initialialize the MMU and activate data cache
+	 * in SPL or in U- Boot pre-reloc stage
+	 */
+	gd->arch.tlb_size = PGTABLE_SIZE;
+#if defined(CONFIG_SPL_BUILD)
+#if (STM32_SYSRAM_BASE + STM32_SYSRAM_SIZE - CONFIG_SPL_STACK < PGTABLE_SIZE)
+#error "The reserved memory for SPL PGTABLE is not enough."
+#endif
+	gd->arch.tlb_addr = CONFIG_SPL_STACK;
+#else
+	/* TLB is located after U-Boot, assumed 2MB as max size */
+	gd->arch.tlb_addr = CONFIG_SYS_TEXT_BASE + SZ_2M;
+#endif
+	dcache_enable();
+	/*
+	 * MMU/TLB is updated in enable_caches() for U-Boot after relocation
+	 * or is deactivated in U-Boot start.S::cpu_init_cp15 for SPL
+	 */
+
 	/* early armv7 timer init: needed for polling */
 	timer_init();
 
@@ -225,7 +245,13 @@  int arch_cpu_init(void)
 
 void enable_caches(void)
 {
-	/* Enable D-cache. I-cache is already enabled in start.S */
+	/* I-cache is already enabled in start.S */
+	/* deactivate the data cache, early enabled in arch_cpu_init() */
+	dcache_disable();
+	/*
+	 * update MMU after relocation, the TLB location was udpated in
+	 * board_f.c::reserve_mmu, and enable the data cache
+	 */
 	dcache_enable();
 }
 
diff --git a/arch/arm/mach-stm32mp/dram_init.c b/arch/arm/mach-stm32mp/dram_init.c
index 3233415eff..15b39ecc03 100644
--- a/arch/arm/mach-stm32mp/dram_init.c
+++ b/arch/arm/mach-stm32mp/dram_init.c
@@ -7,9 +7,29 @@ 
 #include <dm.h>
 #include <lmb.h>
 #include <ram.h>
+#include <asm/cache.h>
 
 DECLARE_GLOBAL_DATA_PTR;
 
+static void set_mmu_dcache(u32 addr, u32 size)
+{
+	int	i;
+	u32 start, end;
+
+	start = addr >> MMU_SECTION_SHIFT;
+	end = ((u64)addr + (u64)size) >> MMU_SECTION_SHIFT;
+
+	for (i = start; i < end; i++) {
+#if defined(CONFIG_SYS_ARM_CACHE_WRITETHROUGH)
+		set_section_dcache(i, DCACHE_WRITETHROUGH);
+#elif defined(CONFIG_SYS_ARM_CACHE_WRITEALLOC)
+		set_section_dcache(i, DCACHE_WRITEALLOC);
+#else
+		set_section_dcache(i, DCACHE_WRITEBACK);
+#endif
+	}
+}
+
 int dram_init(void)
 {
 	struct ram_info ram;
@@ -49,3 +69,18 @@  ulong board_get_usable_ram_top(ulong total_size)
 
 	return gd->ram_top;
 }
+
+void dram_bank_mmu_setup(int bank)
+{
+	if (gd->flags & GD_FLG_RELOC) {
+		debug("%s: bank: %d\n", __func__, bank);
+		set_mmu_dcache(gd->bd->bi_dram[bank].start,
+			       gd->bd->bi_dram[bank].size);
+	} else {
+#if defined(CONFIG_SPL_BUILD)
+		set_mmu_dcache(STM32_SYSRAM_BASE, STM32_SYSRAM_SIZE);
+#else
+		set_mmu_dcache(STM32_DDR_BASE, STM32_DDR_SIZE);
+#endif
+	}
+}
diff --git a/include/configs/stm32mp1.h b/include/configs/stm32mp1.h
index c34a720e0c..5203fc93ad 100644
--- a/include/configs/stm32mp1.h
+++ b/include/configs/stm32mp1.h
@@ -58,8 +58,8 @@ 
 
 /* limit SYSRAM usage to first 128 KB */
 #define CONFIG_SPL_MAX_SIZE		0x00020000
-#define CONFIG_SPL_STACK		(STM32_SYSRAM_BASE + \
-					 STM32_SYSRAM_SIZE)
+/* stack at STM32_SYSRAM_BASE + STM32_SYSRAM_SIZE - PGTABLE_SIZE (=16kB) */
+#define CONFIG_SPL_STACK		0x2FFFC000
 #endif /* #ifdef CONFIG_SPL */
 
 #define CONFIG_SYS_MEMTEST_START	STM32_DDR_BASE