# v2.4.0-test10-1-smp_pte_fix.diff
diff -ur v2.4.0-test10-pre1/include/asm-i386/pgtable-2level.h work-v2.4.0-test10-pre1/include/asm-i386/pgtable-2level.h
--- v2.4.0-test10-pre1/include/asm-i386/pgtable-2level.h Fri Dec 3 14:12:23 1999
+++ work-v2.4.0-test10-pre1/include/asm-i386/pgtable-2level.h Wed Oct 11 16:08:08 2000
@@ -55,4 +55,7 @@
return (pmd_t *) dir;
}
+#define __HAVE_ARCH_pte_xchg_clear /* tested by the "#ifndef __HAVE_ARCH_pte_xchg_clear" guard around the generic fallback */
+#define pte_xchg_clear(xp) __pte(xchg(&(xp)->pte, 0)) /* store 0 into the entry with xchg() and hand back its previous contents as a pte_t */
+
#endif /* _I386_PGTABLE_2LEVEL_H */
diff -ur v2.4.0-test10-pre1/include/asm-i386/pgtable-3level.h work-v2.4.0-test10-pre1/include/asm-i386/pgtable-3level.h
--- v2.4.0-test10-pre1/include/asm-i386/pgtable-3level.h Mon Dec 6 19:19:13 1999
+++ work-v2.4.0-test10-pre1/include/asm-i386/pgtable-3level.h Wed Oct 11 16:14:40 2000
@@ -76,4 +76,17 @@
#define pmd_offset(dir, address) ((pmd_t *) pgd_page(*(dir)) + \
__pmd_offset(address))
+#define __HAVE_ARCH_pte_xchg_clear
+extern inline pte_t pte_xchg_clear(pte_t *ptep) /* atomically read the 64-bit entry at ptep and replace it with 0 */
+{
+ long long res = pte_val(*ptep); /* first guess only; a stale value is corrected by the retry loop */
+__asm__ __volatile__ (
+ "1: lock; cmpxchg8b (%1)\n\t" /* LOCK prefix: without it the 8-byte compare-and-swap is not atomic against other CPUs */
+ "jnz 1b" /* compare failed: edx:eax was reloaded with the current value, so retry */
+ : "=A" (res)
+ :"D"(ptep), "0" (res), "b"(0), "c"(0) /* ecx:ebx = 0 is the value stored when the compare succeeds */
+ : "memory");
+ return (pte_t){ res }; /* the entry's contents from just before it was cleared */
+}
+
#endif /* _I386_PGTABLE_3LEVEL_H */
diff -ur v2.4.0-test10-pre1/include/asm-i386/pgtable.h work-v2.4.0-test10-pre1/include/asm-i386/pgtable.h
--- v2.4.0-test10-pre1/include/asm-i386/pgtable.h Mon Oct 2 14:06:43 2000
+++ work-v2.4.0-test10-pre1/include/asm-i386/pgtable.h Wed Oct 11 17:44:04 2000
@@ -17,6 +17,10 @@
#include <asm/fixmap.h>
#include <linux/threads.h>
+#ifndef _I386_BITOPS_H
+#include <asm/bitops.h>
+#endif
+
extern pgd_t swapper_pg_dir[1024];
extern void paging_init(void);
@@ -145,6 +149,16 @@
* the page directory entry points directly to a 4MB-aligned block of
* memory.
*/
+#define _PAGE_BIT_PRESENT 0 /* bit number of _PAGE_PRESENT (0x001) */
+#define _PAGE_BIT_RW 1 /* bit number of _PAGE_RW (0x002); cleared by atomic_pte_wrprotect() */
+#define _PAGE_BIT_USER 2 /* bit number of _PAGE_USER (0x004) */
+#define _PAGE_BIT_PWT 3
+#define _PAGE_BIT_PCD 4
+#define _PAGE_BIT_ACCESSED 5 /* bit operated on by pte_test_and_clear_young() */
+#define _PAGE_BIT_DIRTY 6 /* bit operated on by pte_test_and_clear_dirty() */
+#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page, Pentium+, if present.. */
+#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
+
#define _PAGE_PRESENT 0x001
#define _PAGE_RW 0x002
#define _PAGE_USER 0x004
@@ -234,6 +248,24 @@
#define pte_none(x) (!pte_val(x))
#define pte_present(x) (pte_val(x) & (_PAGE_PRESENT | _PAGE_PROTNONE))
#define pte_clear(xp) do { set_pte(xp, __pte(0)); } while (0)
+
+#define __HAVE_ARCH_pte_test_and_clear_dirty /* marks that this header provides its own pte_test_and_clear_dirty() */
+static inline int pte_test_and_clear_dirty(pte_t *page_table, pte_t pte) /* 'pte' is not used by this version */
+{
+ return test_and_clear_bit(_PAGE_BIT_DIRTY, page_table); /* clears the dirty bit in the entry itself; the result is test_and_clear_bit()'s report of the bit's previous value */
+}
+
+#define __HAVE_ARCH_pte_test_and_clear_young /* marks that this header provides its own pte_test_and_clear_young() */
+static inline int pte_test_and_clear_young(pte_t *page_table, pte_t pte) /* 'pte' is not used by this version */
+{
+ return test_and_clear_bit(_PAGE_BIT_ACCESSED, page_table); /* clears the accessed bit in the entry itself; the result is test_and_clear_bit()'s report of the bit's previous value */
+}
+
+#define __HAVE_ARCH_atomic_pte_wrprotect /* marks that this header provides its own atomic_pte_wrprotect() */
+static inline void atomic_pte_wrprotect(pte_t *page_table, pte_t old_pte) /* 'old_pte' is not used by this version */
+{
+ clear_bit(_PAGE_BIT_RW, page_table); /* drops only the RW bit in place; the other bits of the entry are not rewritten */
+}
#define pmd_none(x) (!pmd_val(x))
#define pmd_present(x) (pmd_val(x) & _PAGE_PRESENT)
diff -ur v2.4.0-test10-pre1/include/linux/mm.h work-v2.4.0-test10-pre1/include/linux/mm.h
--- v2.4.0-test10-pre1/include/linux/mm.h Tue Oct 3 13:40:38 2000
+++ work-v2.4.0-test10-pre1/include/linux/mm.h Wed Oct 11 17:44:38 2000
@@ -532,6 +532,42 @@
#define vmlist_modify_lock(mm) vmlist_access_lock(mm)
#define vmlist_modify_unlock(mm) vmlist_access_unlock(mm)
+#ifndef __HAVE_ARCH_pte_test_and_clear_young
+static inline int pte_test_and_clear_young(pte_t *page_table, pte_t pte) /* generic fallback: works from the caller's snapshot 'pte' */
+{
+ int was_young = pte_young(pte) != 0; /* 1 if the snapshot was young, else 0 */
+ if (was_young)
+ set_pte(page_table, pte_mkold(pte)); /* write back the snapshot with the young state removed */
+ return was_young;
+}
+#endif
+
+#ifndef __HAVE_ARCH_pte_test_and_clear_dirty
+static inline int pte_test_and_clear_dirty(pte_t *page_table, pte_t pte) /* generic fallback: works from the caller's snapshot 'pte' */
+{
+ int was_dirty = pte_dirty(pte) != 0; /* 1 if the snapshot was dirty, else 0 */
+ if (was_dirty)
+ set_pte(page_table, pte_mkclean(pte)); /* write back the snapshot with the dirty state removed */
+ return was_dirty;
+}
+#endif
+
+#ifndef __HAVE_ARCH_pte_xchg_clear
+static inline pte_t pte_xchg_clear(pte_t *page_table) /* generic fallback: return the entry's contents and leave it cleared */
+{
+ pte_t pte = *page_table; /* read ... */
+ pte_clear(page_table); /* ... then clear: two separate steps, not a single exchange */
+ return pte;
+}
+#endif
+
+#ifndef __HAVE_ARCH_atomic_pte_wrprotect
+static inline void atomic_pte_wrprotect(pte_t *page_table, pte_t old_pte) /* generic fallback used when __HAVE_ARCH_atomic_pte_wrprotect is not defined */
+{
+ set_pte(page_table, pte_wrprotect(old_pte)); /* rewrites the whole entry from the caller's snapshot, write-protected */
+}
+#endif
+
#endif /* __KERNEL__ */
#endif
diff -ur v2.4.0-test10-pre1/mm/filemap.c work-v2.4.0-test10-pre1/mm/filemap.c
--- v2.4.0-test10-pre1/mm/filemap.c Tue Oct 3 13:40:38 2000
+++ work-v2.4.0-test10-pre1/mm/filemap.c Wed Oct 11 18:26:35 2000
@@ -1475,39 +1475,47 @@
return retval;
}
+/* Called with mm->page_table_lock held to protect against other
+ * threads/the swapper from ripping pte's out from under us.
+ */
static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
unsigned long address, unsigned int flags)
{
unsigned long pgoff;
- pte_t pte = *ptep;
+ pte_t pte;
struct page *page;
int error;
+ pte = *ptep;
+
if (!(flags & MS_INVALIDATE)) {
if (!pte_present(pte))
- return 0;
- if (!pte_dirty(pte))
- return 0;
+ goto out;
+ if (!pte_test_and_clear_dirty(ptep, pte))
+ goto out;
flush_page_to_ram(pte_page(pte));
flush_cache_page(vma, address);
- set_pte(ptep, pte_mkclean(pte));
flush_tlb_page(vma, address);
page = pte_page(pte);
page_cache_get(page);
} else {
if (pte_none(pte))
- return 0;
+ goto out;
flush_cache_page(vma, address);
- pte_clear(ptep);
+
+ pte = pte_xchg_clear(ptep);
flush_tlb_page(vma, address);
+
if (!pte_present(pte)) {
+ spin_unlock(&vma->vm_mm->page_table_lock);
swap_free(pte_to_swp_entry(pte));
- return 0;
+ spin_lock(&vma->vm_mm->page_table_lock);
+ goto out;
}
page = pte_page(pte);
if (!pte_dirty(pte) || flags == MS_INVALIDATE) {
page_cache_free(page);
- return 0;
+ goto out;
}
}
pgoff = (address - vma->vm_start) >> PAGE_CACHE_SHIFT;
@@ -1516,11 +1524,18 @@
printk("weirdness: pgoff=%lu index=%lu address=%lu vm_start=%lu vm_pgoff=%lu\n",
pgoff, page->index, address, vma->vm_start, vma->vm_pgoff);
}
+
+ spin_unlock(&vma->vm_mm->page_table_lock);
lock_page(page);
error = filemap_write_page(vma->vm_file, page, 1);
UnlockPage(page);
page_cache_free(page);
+
+ spin_lock(&vma->vm_mm->page_table_lock);
return error;
+
+out:
+ return 0;
}
static inline int filemap_sync_pte_range(pmd_t * pmd,
@@ -1590,6 +1605,11 @@
unsigned long end = address + size;
int error = 0;
+ /* Acquire the lock early; it may be possible to avoid dropping
+ * and reacquiring it repeatedly.
+ */
+ spin_lock(&vma->vm_mm->page_table_lock);
+
dir = pgd_offset(vma->vm_mm, address);
flush_cache_range(vma->vm_mm, end - size, end);
if (address >= end)
@@ -1600,6 +1620,9 @@
dir++;
} while (address && (address < end));
flush_tlb_range(vma->vm_mm, end - size, end);
+
+ spin_unlock(&vma->vm_mm->page_table_lock);
+
return error;
}
diff -ur v2.4.0-test10-pre1/mm/highmem.c work-v2.4.0-test10-pre1/mm/highmem.c
--- v2.4.0-test10-pre1/mm/highmem.c Tue Oct 10 16:57:31 2000
+++ work-v2.4.0-test10-pre1/mm/highmem.c Tue Oct 10 18:13:44 2000
@@ -130,10 +130,10 @@
if (pkmap_count[i] != 1)
continue;
pkmap_count[i] = 0;
- pte = pkmap_page_table[i];
+ //pte = pkmap_page_table[i]; pte_clear(pkmap_page_table+i);
+ pte = pte_xchg_clear(pkmap_page_table+i);
if (pte_none(pte))
BUG();
- pte_clear(pkmap_page_table+i);
page = pte_page(pte);
page->virtual = NULL;
}
diff -ur v2.4.0-test10-pre1/mm/memory.c work-v2.4.0-test10-pre1/mm/memory.c
--- v2.4.0-test10-pre1/mm/memory.c Tue Oct 3 13:40:38 2000
+++ work-v2.4.0-test10-pre1/mm/memory.c Wed Oct 11 18:30:17 2000
@@ -215,30 +215,30 @@
/* copy_one_pte */
if (pte_none(pte))
- goto cont_copy_pte_range;
+ goto cont_copy_pte_range_noset;
if (!pte_present(pte)) {
swap_duplicate(pte_to_swp_entry(pte));
- set_pte(dst_pte, pte);
goto cont_copy_pte_range;
}
ptepage = pte_page(pte);
if ((!VALID_PAGE(ptepage)) ||
- PageReserved(ptepage)) {
- set_pte(dst_pte, pte);
+ PageReserved(ptepage))
goto cont_copy_pte_range;
- }
+
/* If it's a COW mapping, write protect it both in the parent and the child */
if (cow) {
- pte = pte_wrprotect(pte);
- set_pte(src_pte, pte);
+ atomic_pte_wrprotect(src_pte, pte);
+ pte = *src_pte;
}
+
/* If it's a shared mapping, mark it clean in the child */
if (vma->vm_flags & VM_SHARED)
pte = pte_mkclean(pte);
- set_pte(dst_pte, pte_mkold(pte));
+ pte = pte_mkold(pte);
get_page(ptepage);
-
-cont_copy_pte_range: address += PAGE_SIZE;
+
+cont_copy_pte_range: set_pte(dst_pte, pte);
+cont_copy_pte_range_noset: address += PAGE_SIZE;
if (address >= end)
goto out;
src_pte++;
@@ -306,10 +306,9 @@
pte_t page;
if (!size)
break;
- page = *pte;
+ page = pte_xchg_clear(pte);
pte++;
size--;
- pte_clear(pte-1);
if (pte_none(page))
continue;
freed += free_pte(page);
@@ -712,8 +711,8 @@
end = PMD_SIZE;
do {
struct page *page;
- pte_t oldpage = *pte;
- pte_clear(pte);
+ pte_t oldpage;
+ oldpage = pte_xchg_clear(pte);
page = virt_to_page(__va(phys_addr));
if ((!VALID_PAGE(page)) || PageReserved(page))
@@ -746,6 +745,7 @@
return 0;
}
+/* Note: this is only safe if the mm semaphore is held when called. */
int remap_page_range(unsigned long from, unsigned long phys_addr, unsigned long size, pgprot_t prot)
{
int error = 0;
diff -ur v2.4.0-test10-pre1/mm/mremap.c work-v2.4.0-test10-pre1/mm/mremap.c
--- v2.4.0-test10-pre1/mm/mremap.c Tue Oct 3 13:40:38 2000
+++ work-v2.4.0-test10-pre1/mm/mremap.c Wed Oct 11 02:38:41 2000
@@ -63,14 +63,14 @@
pte_t pte;
spin_lock(&mm->page_table_lock);
- pte = *src;
+ pte = pte_xchg_clear(src);
if (!pte_none(pte)) {
- error++;
- if (dst) {
- pte_clear(src);
- set_pte(dst, pte);
- error--;
+ if (!dst) {
+ /* No dest? We must put it back. */
+ dst = src;
+ error++;
}
+ set_pte(dst, pte);
}
spin_unlock(&mm->page_table_lock);
return error;
diff -ur v2.4.0-test10-pre1/mm/vmalloc.c work-v2.4.0-test10-pre1/mm/vmalloc.c
--- v2.4.0-test10-pre1/mm/vmalloc.c Tue Oct 3 13:40:38 2000
+++ work-v2.4.0-test10-pre1/mm/vmalloc.c Wed Oct 11 16:38:21 2000
@@ -34,14 +34,15 @@
if (end > PMD_SIZE)
end = PMD_SIZE;
do {
- pte_t page = *pte;
- pte_clear(pte);
+ pte_t page;
+ page = pte_xchg_clear(pte);
address += PAGE_SIZE;
pte++;
if (pte_none(page))
continue;
if (pte_present(page)) {
struct page *ptpage = pte_page(page);
+ /* FIXME: i am an ugly little race condition */
if (VALID_PAGE(ptpage) && (!PageReserved(ptpage)))
__free_page(ptpage);
continue;
diff -ur v2.4.0-test10-pre1/mm/vmscan.c work-v2.4.0-test10-pre1/mm/vmscan.c
--- v2.4.0-test10-pre1/mm/vmscan.c Tue Oct 10 16:57:31 2000
+++ work-v2.4.0-test10-pre1/mm/vmscan.c Wed Oct 11 18:17:17 2000
@@ -55,8 +55,7 @@
onlist = PageActive(page);
/* Don't look at this pte if it's been accessed recently. */
- if (pte_young(pte)) {
- set_pte(page_table, pte_mkold(pte));
+ if (pte_test_and_clear_young(page_table, pte)) {
if (onlist) {
/*
* Transfer the "accessed" bit from the page
@@ -99,6 +98,10 @@
if (PageSwapCache(page)) {
entry.val = page->index;
swap_duplicate(entry);
+ if (pte_dirty(pte))
+ BUG();
+ if (pte_write(pte))
+ BUG();
set_pte(page_table, swp_entry_to_pte(entry));
drop_pte:
UnlockPage(page);
@@ -109,6 +112,13 @@
goto out_failed;
}
+ /* From this point on, the odds are that we're going to
+ * nuke this pte, so read and clear the pte. This hook
+ * is needed on CPUs which update the accessed and dirty
+ * bits in hardware.
+ */
+ pte = pte_xchg_clear(page_table);
+
/*
* Is it a clean page? Then it must be recoverable
* by just paging it in again, and we can just drop
@@ -124,7 +134,6 @@
*/
if (!pte_dirty(pte)) {
flush_cache_page(vma, address);
- pte_clear(page_table);
goto drop_pte;
}
@@ -134,7 +143,7 @@
* locks etc.
*/
if (!(gfp_mask & __GFP_IO))
- goto out_unlock;
+ goto out_unlock_restore;
/*
* Don't do any of the expensive stuff if
@@ -143,7 +152,7 @@
if (page->zone->free_pages + page->zone->inactive_clean_pages
+ page->zone->inactive_dirty_pages
> page->zone->pages_high + inactive_target)
- goto out_unlock;
+ goto out_unlock_restore;
/*
* Ok, it's really dirty. That means that
@@ -169,7 +178,7 @@
int error;
struct file *file = vma->vm_file;
if (file) get_file(file);
- pte_clear(page_table);
+
mm->rss--;
flush_tlb_page(vma, address);
vmlist_access_unlock(mm);
@@ -191,10 +200,12 @@
*/
entry = get_swap_page();
if (!entry.val)
- goto out_unlock; /* No swap space left */
+ goto out_unlock_restore; /* No swap space left */
- if (!(page = prepare_highmem_swapout(page)))
+ if (!(page = prepare_highmem_swapout(page))) {
+ set_pte(page_table, pte);
goto out_swap_free;
+ }
swap_duplicate(entry); /* One for the process, one for the swap cache */
@@ -218,7 +229,8 @@
swap_free(entry);
out_failed:
return 0;
-out_unlock:
+out_unlock_restore:
+ set_pte(page_table, pte);
UnlockPage(page);
return 0;
}