@@ -18,6 +18,8 @@
#include <linux/const.h>
#include <asm/assembler.h>
#include <asm/page.h>
+#include <asm/cpufeature.h>
+#include <asm/alternative.h>
/*
@@ -28,6 +30,15 @@
* x1 - src
*/
ENTRY(copy_page)
+alternative_if_not ARM64_NEEDS_PREFETCH_128
+ nop
+ nop
+alternative_else
+ # Prefetch two cache lines ahead.
+ prfm pldl1strm, [x1, #128]
+ prfm pldl1strm, [x1, #256]
+alternative_endif
+
ldp x2, x3, [x1]
ldp x4, x5, [x1, #16]
ldp x6, x7, [x1, #32]
@@ -42,6 +53,12 @@ ENTRY(copy_page)
1:
subs x18, x18, #128
+alternative_if_not ARM64_NEEDS_PREFETCH_128
+ nop
+alternative_else
+ prfm pldl1strm, [x1, #384]
+alternative_endif
+
stnp x2, x3, [x0]
ldp x2, x3, [x1]
stnp x4, x5, [x0, #16]
On ThunderX T88 pass 1 and pass 2, there is no hardware prefetching, so we need to patch in software prefetching. Prefetching improves this code by 60% over the original code and 2x over the code without prefetching. Measured by using the benchmark code at https://github.com/apinski-cavium/copy_page_benchmark Signed-off-by: Andrew Pinski <apinski@cavium.com> --- arch/arm64/lib/copy_page.S | 17 +++++++++++++++++ 1 files changed, 17 insertions(+), 0 deletions(-) -- 1.7.2.5