# v2.4.0-test10-1-smp_pte_fix.diff
diff -ur v2.4.0-test10-pre1/include/asm-i386/pgtable-2level.h work-v2.4.0-test10-pre1/include/asm-i386/pgtable-2level.h
--- v2.4.0-test10-pre1/include/asm-i386/pgtable-2level.h	Fri Dec  3 14:12:23 1999
+++ work-v2.4.0-test10-pre1/include/asm-i386/pgtable-2level.h	Wed Oct 11 16:08:08 2000
@@ -55,4 +55,7 @@
 	return (pmd_t *) dir;
 }
 
+#define __HAVE_ARCH_pte_xchg_clear
+#define pte_xchg_clear(xp)	__pte(xchg(&(xp)->pte, 0))
+
 #endif /* _I386_PGTABLE_2LEVEL_H */
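
On 2-level page tables a pte is a single 32-bit word, so one `xchg` both reads
the old entry and zeroes the slot in a single atomic step: there is no window
in which another CPU can see or modify a half-cleared pte. A minimal usage
sketch, where `free_the_page()` is a hypothetical stand-in for whatever
cleanup the caller performs:

```c
/* Sketch: tear down one pte with no read-then-clear window.
 * free_the_page() is hypothetical, standing in for the caller's cleanup. */
static void zap_one_pte(pte_t *ptep)
{
	pte_t old = pte_xchg_clear(ptep);	/* atomically: old = *ptep; *ptep = 0 */

	if (pte_none(old))
		return;
	if (pte_present(old))
		free_the_page(pte_page(old));
}
```
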
diff -ur v2.4.0-test10-pre1/include/asm-i386/pgtable-3level.h work-v2.4.0-test10-pre1/include/asm-i386/pgtable-3level.h
--- v2.4.0-test10-pre1/include/asm-i386/pgtable-3level.h	Mon Dec  6 19:19:13 1999
+++ work-v2.4.0-test10-pre1/include/asm-i386/pgtable-3level.h	Wed Oct 11 16:14:40 2000
@@ -76,4 +76,17 @@
 #define pmd_offset(dir, address) ((pmd_t *) pgd_page(*(dir)) + \
 			__pmd_offset(address))
 
+#define __HAVE_ARCH_pte_xchg_clear
+extern inline pte_t pte_xchg_clear(pte_t *ptep)
+{
+	long long res = pte_val(*ptep);
+	__asm__ __volatile__(
+		"1:	lock; cmpxchg8b (%1)\n"
+		"	jnz 1b"
+		: "=A" (res)
+		: "D" (ptep), "0" (res), "b" (0), "c" (0)
+		: "memory");
+	return (pte_t){ res };
+}
+
 #endif /* _I386_PGTABLE_3LEVEL_H */
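
Under PAE a pte is 64 bits wide, so a plain `xchg` cannot clear it atomically;
the loop above instead uses `cmpxchg8b` with %ecx:%ebx = 0, swapping in a zero
only if the entry still holds the value last read (the "=A" constraint keeps
the 64-bit value in %edx:%eax, which `cmpxchg8b` reloads on failure, so the
loop self-corrects even if the initial read was torn). In C terms the loop is
roughly the following, where `atomic_cmpxchg8b()` is a hypothetical
compare-and-swap helper returning nonzero on success:

```c
/* C rendering of the assembly loop above.  atomic_cmpxchg8b() is a
 * hypothetical 64-bit compare-and-swap, nonzero on success. */
static inline pte_t pte_xchg_clear_sketch(pte_t *ptep)
{
	long long old;

	do {
		old = pte_val(*ptep);	/* possibly stale or torn guess...  */
	} while (!atomic_cmpxchg8b(ptep, old, 0ULL));	/* ...retried until it matches */
	return (pte_t){ old };
}
```
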
diff -ur v2.4.0-test10-pre1/include/asm-i386/pgtable.h work-v2.4.0-test10-pre1/include/asm-i386/pgtable.h
--- v2.4.0-test10-pre1/include/asm-i386/pgtable.h	Mon Oct  2 14:06:43 2000
+++ work-v2.4.0-test10-pre1/include/asm-i386/pgtable.h	Wed Oct 11 17:44:04 2000
@@ -17,6 +17,10 @@
 #include <asm/fixmap.h>
 #include <linux/threads.h>
 
+#ifndef _I386_BITOPS_H
+#include <asm/bitops.h>
+#endif
+
 extern pgd_t swapper_pg_dir[1024];
 extern void paging_init(void);
 
@@ -145,6 +149,16 @@
  * the page directory entry points directly to a 4MB-aligned block of
  * memory. 
  */
+#define _PAGE_BIT_PRESENT	0
+#define _PAGE_BIT_RW		1
+#define _PAGE_BIT_USER		2
+#define _PAGE_BIT_PWT		3
+#define _PAGE_BIT_PCD		4
+#define _PAGE_BIT_ACCESSED	5
+#define _PAGE_BIT_DIRTY		6
+#define _PAGE_BIT_PSE		7	/* 4 MB (or 2MB) page, Pentium+, if present.. */
+#define _PAGE_BIT_GLOBAL	8	/* Global TLB entry PPro+ */
+
 #define _PAGE_PRESENT	0x001
 #define _PAGE_RW	0x002
 #define _PAGE_USER	0x004
@@ -234,6 +248,24 @@
 #define pte_none(x)	(!pte_val(x))
 #define pte_present(x)	(pte_val(x) & (_PAGE_PRESENT | _PAGE_PROTNONE))
 #define pte_clear(xp)	do { set_pte(xp, __pte(0)); } while (0)
+
+#define __HAVE_ARCH_pte_test_and_clear_dirty
+static inline int pte_test_and_clear_dirty(pte_t *page_table, pte_t pte)
+{
+	return test_and_clear_bit(_PAGE_BIT_DIRTY, page_table);
+}
+
+#define __HAVE_ARCH_pte_test_and_clear_young
+static inline int pte_test_and_clear_young(pte_t *page_table, pte_t pte)
+{
+	return test_and_clear_bit(_PAGE_BIT_ACCESSED, page_table);
+}
+
+#define __HAVE_ARCH_atomic_pte_wrprotect
+static inline void atomic_pte_wrprotect(pte_t *page_table, pte_t old_pte)
+{
+	clear_bit(_PAGE_BIT_RW, page_table);
+}
 
 #define pmd_none(x)	(!pmd_val(x))
 #define pmd_present(x)	(pmd_val(x) & _PAGE_PRESENT)
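
The new `_PAGE_BIT_*` constants are simply the bit positions behind the
existing `_PAGE_*` masks; that correspondence is what lets
`test_and_clear_bit()` and `clear_bit()` above perform each update as a single
locked read-modify-write on the pte word. A compile-time sanity sketch of the
pairing:

```c
/* Sanity sketch: each _PAGE_BIT_* must name the position of its mask,
 * otherwise the atomic bit helpers would touch the wrong pte flag. */
#if (1 << _PAGE_BIT_DIRTY) != _PAGE_DIRTY
#error _PAGE_BIT_DIRTY does not match _PAGE_DIRTY
#endif
#if (1 << _PAGE_BIT_ACCESSED) != _PAGE_ACCESSED
#error _PAGE_BIT_ACCESSED does not match _PAGE_ACCESSED
#endif
```
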
diff -ur v2.4.0-test10-pre1/include/linux/mm.h work-v2.4.0-test10-pre1/include/linux/mm.h
--- v2.4.0-test10-pre1/include/linux/mm.h	Tue Oct  3 13:40:38 2000
+++ work-v2.4.0-test10-pre1/include/linux/mm.h	Wed Oct 11 17:44:38 2000
@@ -532,6 +532,42 @@
 #define vmlist_modify_lock(mm)		vmlist_access_lock(mm)
 #define vmlist_modify_unlock(mm)	vmlist_access_unlock(mm)
 
+#ifndef __HAVE_ARCH_pte_test_and_clear_young
+static inline int pte_test_and_clear_young(pte_t *page_table, pte_t pte)
+{
+	if (!pte_young(pte))
+		return 0;
+	set_pte(page_table, pte_mkold(pte));
+	return 1;
+}
+#endif
+
+#ifndef __HAVE_ARCH_pte_test_and_clear_dirty
+static inline int pte_test_and_clear_dirty(pte_t *page_table, pte_t pte)
+{
+	if (!pte_dirty(pte))
+		return 0;
+	set_pte(page_table, pte_mkclean(pte));
+	return 1;
+}
+#endif
+
+#ifndef __HAVE_ARCH_pte_xchg_clear
+static inline pte_t pte_xchg_clear(pte_t *page_table)
+{
+	pte_t pte = *page_table;
+	pte_clear(page_table);
+	return pte;
+}
+#endif
+
+#ifndef __HAVE_ARCH_atomic_pte_wrprotect
+static inline void atomic_pte_wrprotect(pte_t *page_table, pte_t old_pte)
+{
+	set_pte(page_table, pte_wrprotect(old_pte));
+}
+#endif
+
 #endif /* __KERNEL__ */
 
 #endif
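
The `__HAVE_ARCH_*` convention above supplies every port with the plain
read-modify-write fallbacks, while letting an architecture that can do the
update atomically pre-empt them from its own <asm/pgtable.h>, exactly as the
i386 headers earlier in this patch do. A hedged sketch of how a port opts in
(the `ARCH_PTE_YOUNG_BIT` name is hypothetical):

```c
/* In the port's <asm/pgtable.h>, seen before <linux/mm.h> (sketch;
 * ARCH_PTE_YOUNG_BIT is a hypothetical bit number): */
#define __HAVE_ARCH_pte_test_and_clear_young
static inline int pte_test_and_clear_young(pte_t *page_table, pte_t pte)
{
	/* one locked bit-clear: no window between testing and clearing */
	return test_and_clear_bit(ARCH_PTE_YOUNG_BIT, page_table);
}
```
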
diff -ur v2.4.0-test10-pre1/mm/filemap.c work-v2.4.0-test10-pre1/mm/filemap.c
--- v2.4.0-test10-pre1/mm/filemap.c	Tue Oct  3 13:40:38 2000
+++ work-v2.4.0-test10-pre1/mm/filemap.c	Wed Oct 11 18:26:35 2000
@@ -1475,39 +1475,47 @@
 	return retval;
 }
 
+/* Called with mm->page_table_lock held, to keep other threads and
+ * the swapper from ripping ptes out from under us.
+ */
 static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
 	unsigned long address, unsigned int flags)
 {
 	unsigned long pgoff;
-	pte_t pte = *ptep;
+	pte_t pte;
 	struct page *page;
 	int error;
 
+	pte = *ptep;
+
 	if (!(flags & MS_INVALIDATE)) {
 		if (!pte_present(pte))
-			return 0;
-		if (!pte_dirty(pte))
-			return 0;
+			goto out;
+		if (!pte_test_and_clear_dirty(ptep, pte))
+			goto out;
 		flush_page_to_ram(pte_page(pte));
 		flush_cache_page(vma, address);
-		set_pte(ptep, pte_mkclean(pte));
 		flush_tlb_page(vma, address);
 		page = pte_page(pte);
 		page_cache_get(page);
 	} else {
 		if (pte_none(pte))
-			return 0;
+			goto out;
 		flush_cache_page(vma, address);
-		pte_clear(ptep);
+
+		pte = pte_xchg_clear(ptep);
 		flush_tlb_page(vma, address);
+
 		if (!pte_present(pte)) {
+			spin_unlock(&vma->vm_mm->page_table_lock);
 			swap_free(pte_to_swp_entry(pte));
-			return 0;
+			spin_lock(&vma->vm_mm->page_table_lock);
+			goto out;
 		}
 		page = pte_page(pte);
 		if (!pte_dirty(pte) || flags == MS_INVALIDATE) {
 			page_cache_free(page);
-			return 0;
+			goto out;
 		}
 	}
 	pgoff = (address - vma->vm_start) >> PAGE_CACHE_SHIFT;
@@ -1516,11 +1524,18 @@
 		printk("weirdness: pgoff=%lu index=%lu address=%lu vm_start=%lu vm_pgoff=%lu\n",
 			pgoff, page->index, address, vma->vm_start, vma->vm_pgoff);
 	}
+
+	spin_unlock(&vma->vm_mm->page_table_lock);
 	lock_page(page);
 	error = filemap_write_page(vma->vm_file, page, 1);
 	UnlockPage(page);
 	page_cache_free(page);
+
+	spin_lock(&vma->vm_mm->page_table_lock);
 	return error;
+
+out:
+	return 0;
 }
 
 static inline int filemap_sync_pte_range(pmd_t * pmd,
@@ -1590,6 +1605,11 @@
 	unsigned long end = address + size;
 	int error = 0;
 
+	/* Acquire the lock early; it may be possible to avoid dropping
+	 * and reacquiring it repeatedly.
+	 */
+	spin_lock(&vma->vm_mm->page_table_lock);
+
 	dir = pgd_offset(vma->vm_mm, address);
 	flush_cache_range(vma->vm_mm, end - size, end);
 	if (address >= end)
@@ -1600,6 +1620,9 @@
 		dir++;
 	} while (address && (address < end));
 	flush_tlb_range(vma->vm_mm, end - size, end);
+
+	spin_unlock(&vma->vm_mm->page_table_lock);
+
 	return error;
 }
 
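
Note the lock discipline `filemap_sync_pte()` now follows: it runs under
`page_table_lock`, but drops the lock around `swap_free()` and around the
`lock_page()`/`filemap_write_page()` sequence, since those may sleep or take
other locks; it retakes the lock before returning so the pte walk in the
caller stays protected. The idiom in isolation, with `writeout_page()` as a
hypothetical stand-in for the sleeping work:

```c
/* Sketch: enters and returns with mm->page_table_lock held; the
 * sleeping work in the middle must run unlocked.  writeout_page()
 * is hypothetical, standing in for lock_page()/filemap_write_page(). */
static int sync_one_page(struct mm_struct *mm, struct page *page)
{
	int error;

	spin_unlock(&mm->page_table_lock);
	error = writeout_page(page);		/* may sleep */
	spin_lock(&mm->page_table_lock);
	/* caller must recheck the pte: it may have changed while unlocked */
	return error;
}
```
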
diff -ur v2.4.0-test10-pre1/mm/highmem.c work-v2.4.0-test10-pre1/mm/highmem.c
--- v2.4.0-test10-pre1/mm/highmem.c	Tue Oct 10 16:57:31 2000
+++ work-v2.4.0-test10-pre1/mm/highmem.c	Tue Oct 10 18:13:44 2000
@@ -130,10 +130,9 @@
 		if (pkmap_count[i] != 1)
 			continue;
 		pkmap_count[i] = 0;
-		pte = pkmap_page_table[i];
+		pte = pte_xchg_clear(pkmap_page_table+i);
 		if (pte_none(pte))
 			BUG();
-		pte_clear(pkmap_page_table+i);
 		page = pte_page(pte);
 		page->virtual = NULL;
 	}
diff -ur v2.4.0-test10-pre1/mm/memory.c work-v2.4.0-test10-pre1/mm/memory.c
--- v2.4.0-test10-pre1/mm/memory.c	Tue Oct  3 13:40:38 2000
+++ work-v2.4.0-test10-pre1/mm/memory.c	Wed Oct 11 18:30:17 2000
@@ -215,30 +215,30 @@
 				/* copy_one_pte */
 
 				if (pte_none(pte))
-					goto cont_copy_pte_range;
+					goto cont_copy_pte_range_noset;
 				if (!pte_present(pte)) {
 					swap_duplicate(pte_to_swp_entry(pte));
-					set_pte(dst_pte, pte);
 					goto cont_copy_pte_range;
 				}
 				ptepage = pte_page(pte);
 				if ((!VALID_PAGE(ptepage)) || 
-				    PageReserved(ptepage)) {
-					set_pte(dst_pte, pte);
+				    PageReserved(ptepage))
 					goto cont_copy_pte_range;
-				}
+
 				/* If it's a COW mapping, write protect it both in the parent and the child */
 				if (cow) {
-					pte = pte_wrprotect(pte);
-					set_pte(src_pte, pte);
+					atomic_pte_wrprotect(src_pte, pte);
+					pte = *src_pte;
 				}
+
 				/* If it's a shared mapping, mark it clean in the child */
 				if (vma->vm_flags & VM_SHARED)
 					pte = pte_mkclean(pte);
-				set_pte(dst_pte, pte_mkold(pte));
+				pte = pte_mkold(pte);
 				get_page(ptepage);
-			
-cont_copy_pte_range:		address += PAGE_SIZE;
+
+cont_copy_pte_range:		set_pte(dst_pte, pte);
+cont_copy_pte_range_noset:	address += PAGE_SIZE;
 				if (address >= end)
 					goto out;
 				src_pte++;
@@ -306,10 +306,9 @@
 		pte_t page;
 		if (!size)
 			break;
-		page = *pte;
+		page = pte_xchg_clear(pte);
 		pte++;
 		size--;
-		pte_clear(pte-1);
 		if (pte_none(page))
 			continue;
 		freed += free_pte(page);
@@ -712,8 +711,8 @@
 		end = PMD_SIZE;
 	do {
 		struct page *page;
-		pte_t oldpage = *pte;
-		pte_clear(pte);
+		pte_t oldpage;
+		oldpage = pte_xchg_clear(pte);
 
 		page = virt_to_page(__va(phys_addr));
 		if ((!VALID_PAGE(page)) || PageReserved(page))
@@ -746,6 +745,7 @@
 	return 0;
 }
 
+/* Note: this is only safe if the mm semaphore is held when called. */
 int remap_page_range(unsigned long from, unsigned long phys_addr, unsigned long size, pgprot_t prot)
 {
 	int error = 0;
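
The subtle point in the COW path above is re-reading `*src_pte` after
`atomic_pte_wrprotect()`: the CPU sets the accessed and dirty bits in
hardware, so a thread running in the parent can dirty the page between the
original read and the write-protect. A comment-style timeline of the race the
re-read closes (illustrative only):

```c
/*
 * Race closed by re-reading *src_pte (sketch):
 *
 *   CPU0 (fork)                      CPU1 (thread in parent)
 *   pte = *src_pte;
 *                                    hardware sets _PAGE_DIRTY in *src_pte
 *   atomic_pte_wrprotect(src_pte, pte);   (clears only the RW bit)
 *   pte = *src_pte;                  <- picks up CPU1's dirty bit,
 *                                       so it is not lost in the copy
 */
```
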
diff -ur v2.4.0-test10-pre1/mm/mremap.c work-v2.4.0-test10-pre1/mm/mremap.c
--- v2.4.0-test10-pre1/mm/mremap.c	Tue Oct  3 13:40:38 2000
+++ work-v2.4.0-test10-pre1/mm/mremap.c	Wed Oct 11 02:38:41 2000
@@ -63,14 +63,14 @@
 	pte_t pte;
 
 	spin_lock(&mm->page_table_lock);
-	pte = *src;
+	pte = pte_xchg_clear(src);
 	if (!pte_none(pte)) {
-		error++;
-		if (dst) {
-			pte_clear(src);
-			set_pte(dst, pte);
-			error--;
+		if (!dst) {
+			/* No dest?  We must put it back. */
+			dst = src;
+			error++;
 		}
+		set_pte(dst, pte);
 	}
 	spin_unlock(&mm->page_table_lock);
 	return error;
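
The rewritten `move_one_page()` helper is now "take, then place": the entry is
atomically removed from the source slot first, then either installed at the
destination or, when no destination pte could be allocated, put back where it
came from. The new control flow reduced to a sketch:

```c
/* Sketch of the take-then-place move: after pte_xchg_clear() the
 * source slot is empty, so the page is never reachable through two
 * ptes at once. */
pte = pte_xchg_clear(src);		/* take: source slot now empty */
if (!pte_none(pte)) {
	if (!dst) {
		dst = src;		/* no destination: put it back */
		error++;
	}
	set_pte(dst, pte);		/* place (or restore) */
}
```
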
diff -ur v2.4.0-test10-pre1/mm/vmalloc.c work-v2.4.0-test10-pre1/mm/vmalloc.c
--- v2.4.0-test10-pre1/mm/vmalloc.c	Tue Oct  3 13:40:38 2000
+++ work-v2.4.0-test10-pre1/mm/vmalloc.c	Wed Oct 11 16:38:21 2000
@@ -34,14 +34,15 @@
 	if (end > PMD_SIZE)
 		end = PMD_SIZE;
 	do {
-		pte_t page = *pte;
-		pte_clear(pte);
+		pte_t page;
+		page = pte_xchg_clear(pte);
 		address += PAGE_SIZE;
 		pte++;
 		if (pte_none(page))
 			continue;
 		if (pte_present(page)) {
 			struct page *ptpage = pte_page(page);
+			/* FIXME: I am an ugly little race condition */
 			if (VALID_PAGE(ptpage) && (!PageReserved(ptpage)))
 				__free_page(ptpage);
 			continue;
diff -ur v2.4.0-test10-pre1/mm/vmscan.c work-v2.4.0-test10-pre1/mm/vmscan.c
--- v2.4.0-test10-pre1/mm/vmscan.c	Tue Oct 10 16:57:31 2000
+++ work-v2.4.0-test10-pre1/mm/vmscan.c	Wed Oct 11 18:17:17 2000
@@ -55,8 +55,7 @@
 
 	onlist = PageActive(page);
 	/* Don't look at this pte if it's been accessed recently. */
-	if (pte_young(pte)) {
-		set_pte(page_table, pte_mkold(pte));
+	if (pte_test_and_clear_young(page_table, pte)) {
 		if (onlist) {
 			/*
 			 * Transfer the "accessed" bit from the page
@@ -99,6 +98,10 @@
 	if (PageSwapCache(page)) {
 		entry.val = page->index;
 		swap_duplicate(entry);
+		if (pte_dirty(pte))
+			BUG();
+		if (pte_write(pte))
+			BUG();
 		set_pte(page_table, swp_entry_to_pte(entry));
 drop_pte:
 		UnlockPage(page);
@@ -109,6 +112,13 @@
 		goto out_failed;
 	}
 
+	/* From this point on, the odds are that we're going to
+	 * nuke this pte, so read and clear it atomically.  Doing
+	 * both in one step matters on CPUs which update the
+	 * accessed and dirty bits in hardware.
+	 */
+	pte = pte_xchg_clear(page_table);
+
 	/*
 	 * Is it a clean page? Then it must be recoverable
 	 * by just paging it in again, and we can just drop
@@ -124,7 +134,6 @@
 	 */
 	if (!pte_dirty(pte)) {
 		flush_cache_page(vma, address);
-		pte_clear(page_table);
 		goto drop_pte;
 	}
 
@@ -134,7 +143,7 @@
 	 * locks etc.
 	 */
 	if (!(gfp_mask & __GFP_IO))
-		goto out_unlock;
+		goto out_unlock_restore;
 
 	/*
 	 * Don't do any of the expensive stuff if
@@ -143,7 +152,7 @@
 	if (page->zone->free_pages + page->zone->inactive_clean_pages
 					+ page->zone->inactive_dirty_pages
 		      	> page->zone->pages_high + inactive_target)
-		goto out_unlock;
+		goto out_unlock_restore;
 
 	/*
 	 * Ok, it's really dirty. That means that
@@ -169,7 +178,7 @@
 		int error;
 		struct file *file = vma->vm_file;
 		if (file) get_file(file);
-		pte_clear(page_table);
+
 		mm->rss--;
 		flush_tlb_page(vma, address);
 		vmlist_access_unlock(mm);
@@ -191,10 +200,12 @@
 	 */
 	entry = get_swap_page();
 	if (!entry.val)
-		goto out_unlock; /* No swap space left */
+		goto out_unlock_restore; /* No swap space left */
 
-	if (!(page = prepare_highmem_swapout(page)))
+	if (!(page = prepare_highmem_swapout(page))) {
+		set_pte(page_table, pte);
 		goto out_swap_free;
+	}
 
 	swap_duplicate(entry);	/* One for the process, one for the swap cache */
 
@@ -218,7 +229,8 @@
 	swap_free(entry);
 out_failed:
 	return 0;
-out_unlock:
+out_unlock_restore:
+	set_pte(page_table, pte);
 	UnlockPage(page);
 	return 0;
 }
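
The overall pattern in `try_to_swap_out()` is now speculative removal: the pte
is atomically read and cleared up front, and every path that declines or fails
to complete the swap-out restores it via `out_unlock_restore`, so no mapping
is silently lost. Reduced to a sketch, with `cannot_swap_out()` as a
hypothetical predicate for the various bail-out checks:

```c
/* Sketch: speculatively take the pte, restore it on any bail-out.
 * cannot_swap_out() is hypothetical shorthand for the checks above. */
pte = pte_xchg_clear(page_table);	/* mapping gone; A/D bits frozen */
if (cannot_swap_out(page, gfp_mask)) {
	set_pte(page_table, pte);	/* put the mapping back intact */
	UnlockPage(page);
	return 0;
}
```
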