@@ -570,6 +570,15 @@ greater than zero, and currently defaults to 2048 bytes.
This tunable is specific to i386 and x86-64.
@end deftp
+@deftp Tunable glibc.cpu.x86_rep_movsb_stop_threshold
+The @code{glibc.cpu.x86_rep_movsb_stop_threshold} tunable allows the user
+to set the threshold in bytes at which to stop using "rep movsb".  The
+value must be greater than the @code{glibc.cpu.x86_rep_movsb_threshold}
+tunable; the default depends on the CPU and the cache size.
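+
+For example, a limit of 1 MiB (a value chosen here only for illustration;
+the best value is workload dependent) can be requested with:
+
+@example
+GLIBC_TUNABLES=glibc.cpu.x86_rep_movsb_stop_threshold=1048576
+@end example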
+
+This tunable is specific to i386 and x86-64.
+@end deftp
+
@deftp Tunable glibc.cpu.x86_rep_stosb_threshold
The @code{glibc.cpu.x86_rep_stosb_threshold} tunable allows the user to
set threshold in bytes to start using "rep stosb". The value must be
@@ -784,6 +784,14 @@ get_common_cache_info (long int *shared_ptr, long int * shared_per_thread_ptr, u
*threads_ptr = threads;
}
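+/* Return true if the requested stop threshold V is larger than the
+   configured x86_rep_movsb_threshold start value, i.e. the resulting
+   "rep movsb" size range is non-empty.  */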
+static inline bool
+is_rep_movsb_stop_threshold_valid (unsigned long int v)
+{
+ unsigned long int rep_movsb_threshold
+ = TUNABLE_GET (x86_rep_movsb_threshold, long int, NULL);
+ return v > rep_movsb_threshold;
+}
+
static void
dl_init_cacheinfo (struct cpu_features *cpu_features)
{
@@ -791,7 +799,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
long int data = -1;
long int shared = -1;
long int shared_per_thread = -1;
- long int core = -1;
unsigned int threads = 0;
unsigned long int level1_icache_size = -1;
unsigned long int level1_icache_linesize = -1;
@@ -809,7 +816,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
if (cpu_features->basic.kind == arch_kind_intel)
{
data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, cpu_features);
- core = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features);
shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, cpu_features);
shared_per_thread = shared;
@@ -822,7 +828,8 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
= handle_intel (_SC_LEVEL1_DCACHE_ASSOC, cpu_features);
level1_dcache_linesize
= handle_intel (_SC_LEVEL1_DCACHE_LINESIZE, cpu_features);
- level2_cache_size = core;
+ level2_cache_size
+ = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features);
level2_cache_assoc
= handle_intel (_SC_LEVEL2_CACHE_ASSOC, cpu_features);
level2_cache_linesize
@@ -835,12 +842,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
level4_cache_size
= handle_intel (_SC_LEVEL4_CACHE_SIZE, cpu_features);
- get_common_cache_info (&shared, &shared_per_thread, &threads, core);
+ get_common_cache_info (&shared, &shared_per_thread, &threads,
+ level2_cache_size);
}
else if (cpu_features->basic.kind == arch_kind_zhaoxin)
{
data = handle_zhaoxin (_SC_LEVEL1_DCACHE_SIZE);
- core = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE);
shared = handle_zhaoxin (_SC_LEVEL3_CACHE_SIZE);
shared_per_thread = shared;
@@ -849,19 +856,19 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
level1_dcache_size = data;
level1_dcache_assoc = handle_zhaoxin (_SC_LEVEL1_DCACHE_ASSOC);
level1_dcache_linesize = handle_zhaoxin (_SC_LEVEL1_DCACHE_LINESIZE);
- level2_cache_size = core;
+ level2_cache_size = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE);
level2_cache_assoc = handle_zhaoxin (_SC_LEVEL2_CACHE_ASSOC);
level2_cache_linesize = handle_zhaoxin (_SC_LEVEL2_CACHE_LINESIZE);
level3_cache_size = shared;
level3_cache_assoc = handle_zhaoxin (_SC_LEVEL3_CACHE_ASSOC);
level3_cache_linesize = handle_zhaoxin (_SC_LEVEL3_CACHE_LINESIZE);
- get_common_cache_info (&shared, &shared_per_thread, &threads, core);
+ get_common_cache_info (&shared, &shared_per_thread, &threads,
+ level2_cache_size);
}
else if (cpu_features->basic.kind == arch_kind_amd)
{
data = handle_amd (_SC_LEVEL1_DCACHE_SIZE);
- core = handle_amd (_SC_LEVEL2_CACHE_SIZE);
shared = handle_amd (_SC_LEVEL3_CACHE_SIZE);
level1_icache_size = handle_amd (_SC_LEVEL1_ICACHE_SIZE);
@@ -869,7 +876,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
level1_dcache_size = data;
level1_dcache_assoc = handle_amd (_SC_LEVEL1_DCACHE_ASSOC);
level1_dcache_linesize = handle_amd (_SC_LEVEL1_DCACHE_LINESIZE);
- level2_cache_size = core;
+      level2_cache_size = handle_amd (_SC_LEVEL2_CACHE_SIZE);
level2_cache_assoc = handle_amd (_SC_LEVEL2_CACHE_ASSOC);
level2_cache_linesize = handle_amd (_SC_LEVEL2_CACHE_LINESIZE);
level3_cache_size = shared;
@@ -880,12 +887,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
if (shared <= 0)
{
/* No shared L3 cache. All we have is the L2 cache. */
- shared = core;
+ shared = level2_cache_size;
}
else if (cpu_features->basic.family < 0x17)
{
/* Account for exclusive L2 and L3 caches. */
- shared += core;
+ shared += level2_cache_size;
}
shared_per_thread = shared;
@@ -1028,16 +1035,25 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
SIZE_MAX);
unsigned long int rep_movsb_stop_threshold;
- /* ERMS feature is implemented from AMD Zen3 architecture and it is
- performing poorly for data above L2 cache size. Henceforth, adding
- an upper bound threshold parameter to limit the usage of Enhanced
- REP MOVSB operations and setting its value to L2 cache size. */
- if (cpu_features->basic.kind == arch_kind_amd)
- rep_movsb_stop_threshold = core;
- /* Setting the upper bound of ERMS to the computed value of
- non-temporal threshold for architectures other than AMD. */
- else
- rep_movsb_stop_threshold = non_temporal_threshold;
+  /* If the tunable is not set, or if its value is not larger than
+     x86_rep_movsb_threshold, use the default values.  */
+ rep_movsb_stop_threshold = TUNABLE_GET (x86_rep_movsb_stop_threshold,
+ long int, NULL);
+ if (!TUNABLE_IS_INITIALIZED (x86_rep_movsb_stop_threshold)
+ || !is_rep_movsb_stop_threshold_valid (rep_movsb_stop_threshold))
+ {
+      /* For AMD cpus that support ERMS (Zen3+), REP MOVSB is in many cases
+         slower than the vectorized path (and for some alignments it is
+         really slow, check BZ #30994).  */
+ if (cpu_features->basic.kind == arch_kind_amd)
+ rep_movsb_stop_threshold = 0;
+ else
+ /* Setting the upper bound of ERMS to the computed value of
+ non-temporal threshold for architectures other than AMD. */
+ rep_movsb_stop_threshold = non_temporal_threshold;
+ }
+  /* Publish the selected value through the tunable framework.  */
+  TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_stop_threshold,
+                           rep_movsb_stop_threshold, 0, SIZE_MAX);
cpu_features->data_cache_size = data;
cpu_features->shared_cache_size = shared;
@@ -49,6 +49,16 @@ glibc {
# if the tunable value is set by user or not [BZ #27069].
minval: 1
}
+  x86_rep_movsb_stop_threshold {
+    # For AMD cpus that support ERMS (Zen3+), REP MOVSB is not faster
+    # than the vectorized path (and for some destination alignments it
+    # is really slow, check BZ #30994).  On Intel cpus, the size limit
+    # to use ERMS is [1/8, 1/2] of the size of the chip's cache (check
+    # dl-cacheinfo.h).
+    # This tunable allows the caller to set the limit above which
+    # REP MOVSB is no longer used for memcpy/memmove.
+    type: SIZE_T
+  }
x86_rep_stosb_threshold {
type: SIZE_T
# Since there is overhead to set up REP STOSB operation, REP STOSB