@@ -10,12 +10,15 @@
#include <linux/atomic.h>
#include <linux/bug.h>
#include <linux/debugfs.h>
+#include <linux/hash.h>
#include <linux/irq_work.h>
+#include <linux/jhash.h>
#include <linux/kcsan-checks.h>
#include <linux/kfence.h>
#include <linux/kmemleak.h>
#include <linux/list.h>
#include <linux/lockdep.h>
+#include <linux/log2.h>
#include <linux/memblock.h>
#include <linux/moduleparam.h>
#include <linux/random.h>
@@ -82,6 +85,10 @@ static const struct kernel_param_ops sample_interval_param_ops = {
};
module_param_cb(sample_interval, &sample_interval_param_ops, &kfence_sample_interval, 0600);
+/* Pool usage threshold (in percent) above which currently covered allocations are skipped. */
+static unsigned long kfence_skip_covered_thresh __read_mostly = 75;
+module_param_named(skip_covered_thresh, kfence_skip_covered_thresh, ulong, 0644);
+
/* The pool of pages used for guard pages and objects. */
char *__kfence_pool __ro_after_init;
EXPORT_SYMBOL(__kfence_pool); /* Export for test modules. */
@@ -106,6 +113,32 @@ DEFINE_STATIC_KEY_FALSE(kfence_allocation_key);
/* Gates the allocation, ensuring only one succeeds in a given period. */
atomic_t kfence_allocation_gate = ATOMIC_INIT(1);
+/*
+ * A Counting Bloom filter of allocation coverage: limits currently covered
+ * allocations of the same source from filling up the pool.
+ *
+ * Assuming a range of 15%-85% unique allocations in the pool at any point in
+ * time, the parameters below provide a probability of 0.02-0.33 for false
+ * positive hits respectively:
+ *
+ * P(alloc_traces) = (1 - e^(-HNUM * (alloc_traces / SIZE)))^HNUM
+ */
+#define ALLOC_COVERED_HNUM 2
+#define ALLOC_COVERED_ORDER (const_ilog2(CONFIG_KFENCE_NUM_OBJECTS) + 2)
+#define ALLOC_COVERED_SIZE (1 << ALLOC_COVERED_ORDER)
+#define ALLOC_COVERED_HNEXT(h) hash_32(h, ALLOC_COVERED_ORDER)
+#define ALLOC_COVERED_MASK (ALLOC_COVERED_SIZE - 1)
+static atomic_t alloc_covered[ALLOC_COVERED_SIZE];
+
+/* Stack depth used to determine uniqueness of an allocation. */
+#define UNIQUE_ALLOC_STACK_DEPTH ((size_t)8)
+
+/*
+ * Per-boot randomness for stack hashes, making it less likely that the same
+ * hash collisions occur across reboots and different machines.
+ */
+static u32 stack_hash_seed __ro_after_init;
+
/* Statistics counters for debugfs. */
enum kfence_counter_id {
KFENCE_COUNTER_ALLOCATED,
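A quick sanity check of the false-positive figures in the comment above: with the default CONFIG_KFENCE_NUM_OBJECTS of 255, ALLOC_COVERED_ORDER is ilog2(255) + 2 = 9, so SIZE is 512. The standalone userspace snippet below (an illustration, not part of the patch; build with cc fp_check.c -lm) evaluates P at the 15% and 85% endpoints and prints roughly 0.02 and 0.33, matching the comment.

#include <math.h>
#include <stdio.h>

int main(void)
{
	const double HNUM = 2.0, SIZE = 512.0, NUM_OBJECTS = 255.0;
	const double unique[] = { 0.15, 0.85 };

	for (int i = 0; i < 2; i++) {
		/* n: distinct covered traces when the pool is full. */
		double n = unique[i] * NUM_OBJECTS;
		double P = pow(1.0 - exp(-HNUM * (n / SIZE)), HNUM);

		printf("%2.0f%% unique -> P = %.2f\n", unique[i] * 100.0, P);
	}
	return 0;	/* prints 0.02 and 0.33 */
}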
@@ -115,6 +148,7 @@ enum kfence_counter_id {
KFENCE_COUNTER_BUGS,
KFENCE_COUNTER_SKIP_INCOMPAT,
KFENCE_COUNTER_SKIP_CAPACITY,
+ KFENCE_COUNTER_SKIP_COVERED,
KFENCE_COUNTER_COUNT,
};
static atomic_long_t counters[KFENCE_COUNTER_COUNT];
@@ -126,11 +160,57 @@ static const char *const counter_names[] = {
[KFENCE_COUNTER_BUGS] = "total bugs",
[KFENCE_COUNTER_SKIP_INCOMPAT] = "skipped allocations (incompatible)",
[KFENCE_COUNTER_SKIP_CAPACITY] = "skipped allocations (capacity)",
+ [KFENCE_COUNTER_SKIP_COVERED] = "skipped allocations (covered)",
};
static_assert(ARRAY_SIZE(counter_names) == KFENCE_COUNTER_COUNT);
/* === Internals ============================================================ */
+static inline bool should_skip_covered(void)
+{
+ unsigned long thresh = (CONFIG_KFENCE_NUM_OBJECTS * kfence_skip_covered_thresh) / 100;
+
+ return atomic_long_read(&counters[KFENCE_COUNTER_ALLOCATED]) > thresh;
+}
+
+static u32 get_alloc_stack_hash(unsigned long *stack_entries, size_t num_entries)
+{
+ num_entries = min(num_entries, UNIQUE_ALLOC_STACK_DEPTH);
+ num_entries = filter_irq_stacks(stack_entries, num_entries);
+ return jhash(stack_entries, num_entries * sizeof(stack_entries[0]), stack_hash_seed);
+}
+
+/*
+ * Adds (or subtracts) count @val for allocation stack trace hash
+ * @alloc_stack_hash to (from) the Counting Bloom filter.
+ */
+static void alloc_covered_add(u32 alloc_stack_hash, int val)
+{
+ int i;
+
+ for (i = 0; i < ALLOC_COVERED_HNUM; i++) {
+ atomic_add(val, &alloc_covered[alloc_stack_hash & ALLOC_COVERED_MASK]);
+ alloc_stack_hash = ALLOC_COVERED_HNEXT(alloc_stack_hash);
+ }
+}
+
+/*
+ * Returns true if the allocation stack trace hash @alloc_stack_hash is
+ * currently contained (non-zero count) in the Counting Bloom filter.
+ */
+static bool alloc_covered_contains(u32 alloc_stack_hash)
+{
+ int i;
+
+ for (i = 0; i < ALLOC_COVERED_HNUM; i++) {
+ if (!atomic_read(&alloc_covered[alloc_stack_hash & ALLOC_COVERED_MASK]))
+ return false;
+ alloc_stack_hash = ALLOC_COVERED_HNEXT(alloc_stack_hash);
+ }
+
+ return true;
+}
+
static bool kfence_protect(unsigned long addr)
{
return !KFENCE_WARN_ON(!kfence_protect_page(ALIGN_DOWN(addr, PAGE_SIZE), true));
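Two annotations on the helpers above. First, the threshold arithmetic: with the default 255 objects and skip_covered_thresh = 75, should_skip_covered() starts returning true once more than (255 * 75) / 100 = 191 objects are allocated. Second, here is a minimal userspace sketch of the same two-probe Counting Bloom filter, with plain ints standing in for atomic_t and the same golden-ratio multiplicative hash that the kernel's hash_32() uses; an illustration of the add/contains semantics, not the kernel code:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define HNUM	2
#define ORDER	9		/* matches ALLOC_COVERED_ORDER for 255 objects */
#define SIZE	(1u << ORDER)
#define MASK	(SIZE - 1)

static int covered[SIZE];

/* hash_32() analogue: multiply by the 32-bit golden ratio, keep the top ORDER bits. */
static uint32_t hnext(uint32_t h)
{
	return (h * 0x61C88647u) >> (32 - ORDER);
}

static void covered_add(uint32_t h, int val)
{
	for (int i = 0; i < HNUM; i++) {
		covered[h & MASK] += val;
		h = hnext(h);
	}
}

static bool covered_contains(uint32_t h)
{
	for (int i = 0; i < HNUM; i++) {
		if (!covered[h & MASK])
			return false;
		h = hnext(h);
	}
	return true;
}

int main(void)
{
	uint32_t hash = 0xdeadbeef;	/* stands in for a jhash'd stack trace */

	covered_add(hash, 1);
	printf("%d\n", covered_contains(hash));	/* 1: currently covered */
	covered_add(hash, -1);
	printf("%d\n", covered_contains(hash));	/* 0, barring collisions */
	return 0;
}

Note that contains returns true only when both probed counters are non-zero, which is exactly where the false-positive probability from the formula comes in.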
@@ -270,7 +350,8 @@ static __always_inline void for_each_canary(const struct kfence_metadata *meta,
}
static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t gfp,
- unsigned long *stack_entries, size_t num_stack_entries)
+ unsigned long *stack_entries, size_t num_stack_entries,
+ u32 alloc_stack_hash)
{
struct kfence_metadata *meta = NULL;
unsigned long flags;
@@ -333,6 +414,8 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
/* Pairs with READ_ONCE() in kfence_shutdown_cache(). */
WRITE_ONCE(meta->cache, cache);
meta->size = size;
+ meta->alloc_stack_hash = alloc_stack_hash;
+
for_each_canary(meta, set_canary_byte);
/* Set required struct page fields. */
@@ -345,6 +428,8 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
raw_spin_unlock_irqrestore(&meta->lock, flags);
+ alloc_covered_add(alloc_stack_hash, 1);
+
/* Memory initialization. */
/*
@@ -413,6 +498,8 @@ static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool z
raw_spin_unlock_irqrestore(&meta->lock, flags);
+ alloc_covered_add(meta->alloc_stack_hash, -1);
+
/* Protect to detect use-after-frees. */
kfence_protect((unsigned long)addr);
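Note the symmetry with the allocation path: the alloc_covered_add(..., 1) in kfence_guarded_alloc() is paired with the -1 here, using the hash stashed in meta->alloc_stack_hash at allocation time rather than rehashing the stack. This pairing is also why a counting Bloom filter is needed at all: a plain Bloom filter supports no removal, so the sources of long-since-freed allocations would stay "covered" forever.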
@@ -679,6 +766,7 @@ void __init kfence_init(void)
if (!kfence_sample_interval)
return;
+ stack_hash_seed = (u32)random_get_entropy();
if (!kfence_init_pool()) {
pr_err("%s failed\n", __func__);
return;
@@ -756,6 +844,7 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
{
unsigned long stack_entries[KFENCE_STACK_DEPTH];
size_t num_stack_entries;
+ u32 alloc_stack_hash;
/*
* Perform size check before switching kfence_allocation_gate, so that
@@ -798,7 +887,23 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
num_stack_entries = stack_trace_save(stack_entries, KFENCE_STACK_DEPTH, 0);
- return kfence_guarded_alloc(s, size, flags, stack_entries, num_stack_entries);
+ /*
+ * Do the expensive coverage check in the slow path, after the
+ * allocation_gate has already become non-zero, even though this might
+ * mean not making any allocation within a given sample interval.
+ *
+ * This ensures reasonable allocation coverage when the pool is almost
+ * full, in particular keeping long-lived allocations from the same
+ * source (e.g. pagecache allocations) from filling up the pool.
+ */
+ alloc_stack_hash = get_alloc_stack_hash(stack_entries, num_stack_entries);
+ if (should_skip_covered() && alloc_covered_contains(alloc_stack_hash)) {
+ atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_COVERED]);
+ return NULL;
+ }
+
+ return kfence_guarded_alloc(s, size, flags, stack_entries, num_stack_entries,
+ alloc_stack_hash);
}
size_t kfence_ksize(const void *addr)
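Once applied, the policy is observable at runtime: the new "skipped allocations (covered)" entry in counter_names shows up in KFENCE's debugfs stats file (/sys/kernel/debug/kfence/stats), alongside the existing capacity and incompatibility skip counters.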
@@ -87,6 +87,8 @@ struct kfence_metadata {
/* Allocation and free stack information. */
struct kfence_track alloc_track;
struct kfence_track free_track;
+ /* For updating alloc_covered on frees. */
+ u32 alloc_stack_hash;
};
extern struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS];