rpms/kernel/devel linux-2.6-ksm-kvm.patch, NONE, 1.1 linux-2.6-ksm.patch, NONE, 1.1 config-generic, 1.310, 1.311 kernel.spec, 1.1650, 1.1651

jforbes jforbes at fedoraproject.org
Thu Jul 23 18:25:23 UTC 2009


Author: jforbes

Update of /cvs/pkgs/rpms/kernel/devel
In directory cvs1.fedora.phx.redhat.com:/tmp/cvs-serv28593

Modified Files:
	config-generic kernel.spec 
Added Files:
	linux-2.6-ksm-kvm.patch linux-2.6-ksm.patch 
Log Message:
Add KSM support

linux-2.6-ksm-kvm.patch:
 arch/x86/include/asm/kvm_host.h |    1 
 arch/x86/kvm/mmu.c              |   89 ++++++++++++++++++++++++++++++++--------
 arch/x86/kvm/paging_tmpl.h      |   11 +++-
 virt/kvm/kvm_main.c             |   14 ++++++
 4 files changed, 96 insertions(+), 19 deletions(-)

--- NEW FILE linux-2.6-ksm-kvm.patch ---
When using mmu notifiers, we are allowed to drop the page-count
reference taken by get_user_pages() on a page that is mapped
inside the shadow page tables.

This is needed so that the page count can be balanced against the
map count when the two are compared.

(Right now kvm increases the page count but does not increase the
map count when it maps a page into a shadow page table entry, so
comparing the page count against the map count gives no reliable
result.)

Add an SPTE_HOST_WRITEABLE flag to record that the host physical page
the spte points to is write protected, and therefore we cannot make it
writable unless we run get_user_pages(write = 1).

(This is needed for change_pte support in kvm.)

Support for the change_pte mmu notifier is needed in kvm if we want
ksm pages to be mapped directly into its shadow page tables.

Signed-off-by: Izik Eidus <ieidus at redhat.com>
Signed-off-by: Justin M. Forbes <jforbes at redhat.com>
---
--- linux-2.6.30.x86_64/arch/x86/include/asm/kvm_host.h	2009-07-23 11:24:43.000000000 -0500
+++ linux-2.6.30.x86_64-ksm/arch/x86/include/asm/kvm_host.h	2009-07-23 12:43:57.000000000 -0500
@@ -796,5 +796,6 @@ asmlinkage void kvm_handle_fault_on_rebo
 int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
 int kvm_age_hva(struct kvm *kvm, unsigned long hva);
 int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
+void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
 
 #endif /* _ASM_X86_KVM_HOST_H */
--- linux-2.6.30.x86_64/arch/x86/kvm/mmu.c	2009-07-23 11:24:43.000000000 -0500
+++ linux-2.6.30.x86_64-ksm/arch/x86/kvm/mmu.c	2009-07-23 12:42:36.000000000 -0500
@@ -139,6 +139,8 @@ module_param(oos_shadow, bool, 0644);
 #define ACC_USER_MASK    PT_USER_MASK
 #define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
 
+#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
+
 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
 
 struct kvm_rmap_desc {
@@ -254,6 +256,11 @@ static pfn_t spte_to_pfn(u64 pte)
 	return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
 }
 
+static pte_t ptep_val(pte_t *ptep)
+{
+	return *ptep;
+}
+
 static gfn_t pse36_gfn_delta(u32 gpte)
 {
 	int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
@@ -566,9 +573,7 @@ static void rmap_remove(struct kvm *kvm,
 	if (*spte & shadow_accessed_mask)
 		kvm_set_pfn_accessed(pfn);
 	if (is_writeble_pte(*spte))
-		kvm_release_pfn_dirty(pfn);
-	else
-		kvm_release_pfn_clean(pfn);
+		kvm_set_pfn_dirty(pfn);
 	rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], is_large_pte(*spte));
 	if (!*rmapp) {
 		printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
@@ -677,7 +682,8 @@ static int rmap_write_protect(struct kvm
 	return write_protected;
 }
 
-static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
+static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
+			   unsigned long data)
 {
 	u64 *spte;
 	int need_tlb_flush = 0;
@@ -692,8 +698,48 @@ static int kvm_unmap_rmapp(struct kvm *k
 	return need_tlb_flush;
 }
 
+static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
+			     unsigned long data)
+{
+	int need_flush = 0;
+	u64 *spte, new_spte;
+	pte_t *ptep = (pte_t *)data;
+	pfn_t new_pfn;
+
+	new_pfn = pte_pfn(ptep_val(ptep));
+	spte = rmap_next(kvm, rmapp, NULL);
+	while (spte) {
+		BUG_ON(!is_shadow_present_pte(*spte));
+		rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
+		need_flush = 1;
+		if (pte_write(ptep_val(ptep))) {
+			rmap_remove(kvm, spte);
+			set_shadow_pte(spte, shadow_trap_nonpresent_pte);
+			spte = rmap_next(kvm, rmapp, NULL);
+		} else {
+			new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
+			new_spte |= new_pfn << PAGE_SHIFT;
+
+			if (!pte_write(ptep_val(ptep))) {
+				new_spte &= ~PT_WRITABLE_MASK;
+				new_spte &= ~SPTE_HOST_WRITEABLE;
+				if (is_writeble_pte(*spte))
+					kvm_set_pfn_dirty(spte_to_pfn(*spte));
+			}
+			set_shadow_pte(spte, new_spte);
+			spte = rmap_next(kvm, rmapp, spte);
+		}
+	}
+	if (need_flush)
+		kvm_flush_remote_tlbs(kvm);
+
+	return 0;
+}
+
 static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
-			  int (*handler)(struct kvm *kvm, unsigned long *rmapp))
+			  unsigned long data,
+			  int (*handler)(struct kvm *kvm, unsigned long *rmapp,
+					 unsigned long data))
 {
 	int i;
 	int retval = 0;
@@ -714,11 +760,13 @@ static int kvm_handle_hva(struct kvm *kv
 		end = start + (memslot->npages << PAGE_SHIFT);
 		if (hva >= start && hva < end) {
 			gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
-			retval |= handler(kvm, &memslot->rmap[gfn_offset]);
+			retval |= handler(kvm, &memslot->rmap[gfn_offset],
+					  data);
 			retval |= handler(kvm,
 					  &memslot->lpage_info[
 						  gfn_offset /
-						  KVM_PAGES_PER_HPAGE].rmap_pde);
+						  KVM_PAGES_PER_HPAGE].rmap_pde,
+						  data);
 		}
 	}
 
@@ -727,10 +775,16 @@ static int kvm_handle_hva(struct kvm *kv
 
 int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
 {
-	return kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
+	return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp);
+}
+
+void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+{
+	kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
 }
 
-static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp)
+static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
+			 unsigned long data)
 {
 	u64 *spte;
 	int young = 0;
@@ -756,7 +810,7 @@ static int kvm_age_rmapp(struct kvm *kvm
 
 int kvm_age_hva(struct kvm *kvm, unsigned long hva)
 {
-	return kvm_handle_hva(kvm, hva, kvm_age_rmapp);
+	return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp);
 }
 
 #ifdef MMU_DEBUG
@@ -1665,7 +1719,7 @@ static int set_spte(struct kvm_vcpu *vcp
 		    unsigned pte_access, int user_fault,
 		    int write_fault, int dirty, int largepage,
 		    gfn_t gfn, pfn_t pfn, bool speculative,
-		    bool can_unsync)
+		    bool can_unsync, bool reset_host_protection)
 {
 	u64 spte;
 	int ret = 0;
@@ -1723,6 +1777,8 @@ static int set_spte(struct kvm_vcpu *vcp
 				spte &= ~PT_WRITABLE_MASK;
 		}
 	}
+	if (reset_host_protection)
+		spte |= SPTE_HOST_WRITEABLE;
 
 	if (pte_access & ACC_WRITE_MASK)
 		mark_page_dirty(vcpu->kvm, gfn);
@@ -1736,7 +1792,8 @@ static void mmu_set_spte(struct kvm_vcpu
 			 unsigned pt_access, unsigned pte_access,
 			 int user_fault, int write_fault, int dirty,
 			 int *ptwrite, int largepage, gfn_t gfn,
-			 pfn_t pfn, bool speculative)
+			 pfn_t pfn, bool speculative,
+             bool reset_host_protection)
 {
 	int was_rmapped = 0;
 	int was_writeble = is_writeble_pte(*shadow_pte);
@@ -1765,7 +1822,8 @@ static void mmu_set_spte(struct kvm_vcpu
 			was_rmapped = 1;
 	}
 	if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault,
-		      dirty, largepage, gfn, pfn, speculative, true)) {
+		      dirty, largepage, gfn, pfn, speculative, true,
+              reset_host_protection)) {
 		if (write_fault)
 			*ptwrite = 1;
 		kvm_x86_ops->tlb_flush(vcpu);
@@ -1782,8 +1840,7 @@ static void mmu_set_spte(struct kvm_vcpu
 	page_header_update_slot(vcpu->kvm, shadow_pte, gfn);
 	if (!was_rmapped) {
 		rmap_add(vcpu, shadow_pte, gfn, largepage);
-		if (!is_rmap_pte(*shadow_pte))
-			kvm_release_pfn_clean(pfn);
+		kvm_release_pfn_clean(pfn);
 	} else {
 		if (was_writeble)
 			kvm_release_pfn_dirty(pfn);
@@ -1813,7 +1870,7 @@ static int __direct_map(struct kvm_vcpu 
 		    || (largepage && iterator.level == PT_DIRECTORY_LEVEL)) {
 			mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL,
 				     0, write, 1, &pt_write,
-				     largepage, gfn, pfn, false);
+				     largepage, gfn, pfn, false, true);
 			++vcpu->stat.pf_fixed;
 			break;
 		}
--- linux-2.6.30.x86_64/arch/x86/kvm/paging_tmpl.h	2009-07-23 11:24:43.000000000 -0500
+++ linux-2.6.30.x86_64-ksm/arch/x86/kvm/paging_tmpl.h	2009-07-23 12:41:27.000000000 -0500
@@ -266,9 +266,13 @@ static void FNAME(update_pte)(struct kvm
 	if (mmu_notifier_retry(vcpu, vcpu->arch.update_pte.mmu_seq))
 		return;
 	kvm_get_pfn(pfn);
+    /*
+     * we call mmu_set_spte() with reset_host_protection = true beacuse that
+     * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
+     */ 
 	mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
 		     gpte & PT_DIRTY_MASK, NULL, largepage,
-		     gpte_to_gfn(gpte), pfn, true);
+		     gpte_to_gfn(gpte), pfn, true, true);
 }
 
 /*
@@ -302,7 +306,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu
 				     user_fault, write_fault,
 				     gw->ptes[gw->level-1] & PT_DIRTY_MASK,
 				     ptwrite, largepage,
-				     gw->gfn, pfn, false);
+				     gw->gfn, pfn, false, true);
 			break;
 		}
 
@@ -552,6 +556,7 @@ static void FNAME(prefetch_page)(struct 
 static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 {
 	int i, offset, nr_present;
+        bool reset_host_protection = 1;
 
 	offset = nr_present = 0;
 
@@ -591,7 +596,7 @@ static int FNAME(sync_page)(struct kvm_v
 		pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
 		set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
 			 is_dirty_pte(gpte), 0, gfn,
-			 spte_to_pfn(sp->spt[i]), true, false);
+			 spte_to_pfn(sp->spt[i]), true, false, reset_host_protection);
 	}
 
 	return !nr_present;
--- linux-2.6.30.x86_64/virt/kvm/kvm_main.c	2009-07-23 11:24:45.000000000 -0500
+++ linux-2.6.30.x86_64-ksm/virt/kvm/kvm_main.c	2009-07-23 12:42:36.000000000 -0500
@@ -859,6 +859,19 @@ static void kvm_mmu_notifier_invalidate_
 
 }
 
+static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
+					struct mm_struct *mm,
+					unsigned long address,
+					pte_t pte)
+{
+	struct kvm *kvm = mmu_notifier_to_kvm(mn);
+
+	spin_lock(&kvm->mmu_lock);
+	kvm->mmu_notifier_seq++;
+	kvm_set_spte_hva(kvm, address, pte);
+	spin_unlock(&kvm->mmu_lock);
+}
+
 static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 						    struct mm_struct *mm,
 						    unsigned long start,
@@ -938,6 +951,7 @@ static const struct mmu_notifier_ops kvm
 	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,
 	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
 	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,
+	.change_pte		= kvm_mmu_notifier_change_pte,
 	.release		= kvm_mmu_notifier_release,
 };
 #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
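
The rmap_remove()/mmu_set_spte() hunks above are what make the count
balancing mentioned in the description possible: once the spte is
installed, kvm drops its get_user_pages() reference unconditionally
instead of holding it for the lifetime of the mapping.  Below is a
minimal sketch, not taken from these patches, of the kind of check ksm
performs before merging a page (modelled on the write_protect_page()
logic in mm/ksm.c; the helper name and exact accounting are
illustrative assumptions).  Any extra pinned reference -- such as the
one kvm used to keep per shadow pte -- makes the comparison fail and
the page is skipped.

#include <linux/mm.h>

/*
 * Simplified, assumed sketch of ksm's count check: "swapped" counts
 * references held via the swap cache, and one further reference is
 * typically held by the scanner itself while it inspects the page.
 * Anything beyond that means someone else has pinned the page, so it
 * must not be write protected and merged.
 */
static bool ksm_page_counts_balanced(struct page *page, int swapped)
{
	return page_mapcount(page) + 1 + swapped == page_count(page);
}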

linux-2.6-ksm.patch:
 b/arch/alpha/include/asm/mman.h     |    3 
 b/arch/mips/include/asm/mman.h      |    3 
 b/arch/parisc/include/asm/mman.h    |    3 
 b/arch/xtensa/include/asm/mman.h    |    3 
 b/fs/proc/page.c                    |    5 
 b/include/asm-generic/mman-common.h |    5 
 b/include/linux/ksm.h               |   50 +
 b/include/linux/mm.h                |    1 
 b/include/linux/mmu_notifier.h      |   34 
 b/include/linux/rmap.h              |    6 
 b/include/linux/sched.h             |    7 
 b/kernel/fork.c                     |    8 
 b/mm/Kconfig                        |   11 
 b/mm/Makefile                       |    1 
 b/mm/ksm.c                          |   56 +
 b/mm/madvise.c                      |   41 
 b/mm/memory.c                       |    9 
 b/mm/mmu_notifier.c                 |   22 
 b/mm/mremap.c                       |   14 
 b/mm/rmap.c                         |   23 
 include/linux/ksm.h                 |   29 
 mm/ksm.c                            | 1506 +++++++++++++++++++++++++++++++++++-
 mm/madvise.c                        |   16 
 mm/memory.c                         |    7 
 24 files changed, 1780 insertions(+), 83 deletions(-)
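
Before the patch itself, one orientation note: the mman.h hunks listed
above add MADV_MERGEABLE and MADV_UNMERGEABLE advice values, and ksm
only scans regions an application has explicitly opted in.  The
following is a hypothetical userspace sketch (not part of the patch;
the fallback #define assumes the value 12 used by the upstream
asm-generic header) of how a program such as qemu would register its
memory:

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MADV_MERGEABLE
#define MADV_MERGEABLE 12	/* assumed value; use the patched <asm/mman.h> when available */
#endif

int main(void)
{
	size_t len = 16 * 1024 * 1024;
	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* Fill the area with identical pages so ksmd has something to merge. */
	memset(buf, 0x5a, len);

	/* Opt the region in; merging happens only while ksmd is running. */
	if (madvise(buf, len, MADV_MERGEABLE) != 0) {
		perror("madvise(MADV_MERGEABLE)");
		return 1;
	}

	pause();	/* keep the mapping alive while ksmd scans it */
	return 0;
}

MADV_UNMERGEABLE undoes the registration, which is what the mremap
patch further down uses via ksm_madvise() to break merged pages before
a move.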

--- NEW FILE linux-2.6-ksm.patch ---
From: Izik Eidus <ieidus at redhat.com>
Subject: [PATCH 01/10] ksm: add mmu_notifier set_pte_at_notify()
Date: 	Fri, 17 Jul 2009 20:30:41 +0300

The set_pte_at_notify() macro allows setting a pte in the shadow page
table directly, instead of flushing the shadow page table entry and then
taking a vmexit to set it.  It uses a new change_pte() callback to do so.

set_pte_at_notify() is an optimization for kvm, and other users of
mmu_notifiers, for COW pages.  It is useful for kvm when ksm is used,
because it lets kvm map the ksm page directly into the shadow page
table, at the same time as Linux maps the page into the host page
table, instead of first taking a vmexit and only then mapping it.

Users of mmu_notifiers that don't implement the new
mmu_notifier_change_pte() callback will just receive the
mmu_notifier_invalidate_page() callback.

Signed-off-by: Izik Eidus <ieidus at redhat.com>
Signed-off-by: Chris Wright <chrisw at redhat.com>
Signed-off-by: Hugh Dickins <hugh.dickins at tiscali.co.uk>
---
 include/linux/mmu_notifier.h |   34 ++++++++++++++++++++++++++++++++++
 mm/memory.c                  |    9 +++++++--
 mm/mmu_notifier.c            |   20 ++++++++++++++++++++
 3 files changed, 61 insertions(+), 2 deletions(-)

diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index b77486d..4e02ee2 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -62,6 +62,15 @@ struct mmu_notifier_ops {
 				 unsigned long address);
 
 	/*
+	 * change_pte is called in cases that pte mapping to page is changed:
+	 * for example, when ksm remaps pte to point to a new shared page.
+	 */
+	void (*change_pte)(struct mmu_notifier *mn,
+			   struct mm_struct *mm,
+			   unsigned long address,
+			   pte_t pte);
+
+	/*
 	 * Before this is invoked any secondary MMU is still ok to
 	 * read/write to the page previously pointed to by the Linux
 	 * pte because the page hasn't been freed yet and it won't be
@@ -154,6 +163,8 @@ extern void __mmu_notifier_mm_destroy(struct mm_struct *mm);
 extern void __mmu_notifier_release(struct mm_struct *mm);
 extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
 					  unsigned long address);
+extern void __mmu_notifier_change_pte(struct mm_struct *mm,
+				      unsigned long address, pte_t pte);
 extern void __mmu_notifier_invalidate_page(struct mm_struct *mm,
 					  unsigned long address);
 extern void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
@@ -175,6 +186,13 @@ static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
 	return 0;
 }
 
+static inline void mmu_notifier_change_pte(struct mm_struct *mm,
+					   unsigned long address, pte_t pte)
+{
+	if (mm_has_notifiers(mm))
+		__mmu_notifier_change_pte(mm, address, pte);
+}
+
 static inline void mmu_notifier_invalidate_page(struct mm_struct *mm,
 					  unsigned long address)
 {
@@ -236,6 +254,16 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
 	__young;							\
 })
 
+#define set_pte_at_notify(__mm, __address, __ptep, __pte)		\
+({									\
+	struct mm_struct *___mm = __mm;					\
+	unsigned long ___address = __address;				\
+	pte_t ___pte = __pte;						\
+									\
+	set_pte_at(___mm, ___address, __ptep, ___pte);			\
+	mmu_notifier_change_pte(___mm, ___address, ___pte);		\
+})
+
 #else /* CONFIG_MMU_NOTIFIER */
 
 static inline void mmu_notifier_release(struct mm_struct *mm)
@@ -248,6 +276,11 @@ static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
 	return 0;
 }
 
+static inline void mmu_notifier_change_pte(struct mm_struct *mm,
+					   unsigned long address, pte_t pte)
+{
+}
+
 static inline void mmu_notifier_invalidate_page(struct mm_struct *mm,
 					  unsigned long address)
 {
@@ -273,6 +306,7 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
 
 #define ptep_clear_flush_young_notify ptep_clear_flush_young
 #define ptep_clear_flush_notify ptep_clear_flush
+#define set_pte_at_notify set_pte_at
 
 #endif /* CONFIG_MMU_NOTIFIER */
 
diff --git a/mm/memory.c b/mm/memory.c
index 6521619..8159a62 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2113,9 +2113,14 @@ gotten:
 		 * seen in the presence of one thread doing SMC and another
 		 * thread doing COW.
 		 */
-		ptep_clear_flush_notify(vma, address, page_table);
+		ptep_clear_flush(vma, address, page_table);
 		page_add_new_anon_rmap(new_page, vma, address);
-		set_pte_at(mm, address, page_table, entry);
+		/*
+		 * We call the notify macro here because, when using secondary
+		 * mmu page tables (such as kvm shadow page tables), we want the
+		 * new page to be mapped directly into the secondary page table.
+		 */
+		set_pte_at_notify(mm, address, page_table, entry);
 		update_mmu_cache(vma, address, entry);
 		if (old_page) {
 			/*
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 5f4ef02..7e33f2c 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -99,6 +99,26 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
 	return young;
 }
 
+void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
+			       pte_t pte)
+{
+	struct mmu_notifier *mn;
+	struct hlist_node *n;
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
+		if (mn->ops->change_pte)
+			mn->ops->change_pte(mn, mm, address, pte);
+		/*
+		 * Some drivers don't have change_pte,
+		 * so we must call invalidate_page in that case.
+		 */
+		else if (mn->ops->invalidate_page)
+			mn->ops->invalidate_page(mn, mm, address);
+	}
+	rcu_read_unlock();
+}
+
 void __mmu_notifier_invalidate_page(struct mm_struct *mm,
 					  unsigned long address)
 {
-- 
1.5.6.5

--

From: Izik Eidus <ieidus at redhat.com>
Subject: [PATCH 02/10] ksm: first tidy up madvise_vma()
Date: 	Fri, 17 Jul 2009 20:30:42 +0300

madvise.c has several levels of switch statements; which one should
handle what?  Move the MADV_DOFORK code down from madvise_vma() to
madvise_behavior(), so that madvise_vma() can be a simple router that
defaults to madvise_behavior().

vma->vm_flags is an unsigned long, so use the same type for new_flags.
Add the missing comment lines describing MADV_DONTFORK and MADV_DOFORK.

Signed-off-by: Hugh Dickins <hugh.dickins at tiscali.co.uk>
Signed-off-by: Chris Wright <chrisw at redhat.com>
Signed-off-by: Izik Eidus <ieidus at redhat.com>
---
 mm/madvise.c |   39 +++++++++++++--------------------------
 1 files changed, 13 insertions(+), 26 deletions(-)

diff --git a/mm/madvise.c b/mm/madvise.c
index 76eb419..66c3126 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -41,7 +41,7 @@ static long madvise_behavior(struct vm_area_struct * vma,
 	struct mm_struct * mm = vma->vm_mm;
 	int error = 0;
 	pgoff_t pgoff;
-	int new_flags = vma->vm_flags;
+	unsigned long new_flags = vma->vm_flags;
 
 	switch (behavior) {
 	case MADV_NORMAL:
@@ -57,6 +57,10 @@ static long madvise_behavior(struct vm_area_struct * vma,
 		new_flags |= VM_DONTCOPY;
 		break;
 	case MADV_DOFORK:
+		if (vma->vm_flags & VM_IO) {
[...2246 lines suppressed...]
+	unsigned long nr_pages;
+
+	err = strict_strtoul(buf, 10, &nr_pages);
+	if (err)
+		return -EINVAL;
+
+	ksm_max_kernel_pages = nr_pages;
+
+	return count;
+}
+
+static ssize_t max_kernel_pages_show(struct kobject *kobj,
+				     struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%lu\n", ksm_max_kernel_pages);
+}
+KSM_ATTR(max_kernel_pages);
+
+static struct attribute *ksm_attrs[] = {
+	&sleep_millisecs_attr.attr,
+	&pages_to_scan_attr.attr,
+	&run_attr.attr,
+	&pages_shared_attr.attr,
+	&kernel_pages_allocated_attr.attr,
+	&max_kernel_pages_attr.attr,
+	NULL,
+};
+
+static struct attribute_group ksm_attr_group = {
+	.attrs = ksm_attrs,
+	.name = "ksm",
+};
+
+static int __init ksm_init(void)
+{
+	struct task_struct *ksm_thread;
+	int err;
+
+	err = ksm_slab_init();
+	if (err)
+		goto out;
+
+	err = mm_slots_hash_init();
+	if (err)
+		goto out_free1;
+
+	ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
+	if (IS_ERR(ksm_thread)) {
+		printk(KERN_ERR "ksm: creating kthread failed\n");
+		err = PTR_ERR(ksm_thread);
+		goto out_free2;
+	}
+
+	err = sysfs_create_group(mm_kobj, &ksm_attr_group);
+	if (err) {
+		printk(KERN_ERR "ksm: register sysfs failed\n");
+		goto out_free3;
+	}
+
+	return 0;
+
+out_free3:
+	kthread_stop(ksm_thread);
+out_free2:
+	mm_slots_hash_free();
+out_free1:
+	ksm_slab_free();
+out:
+	return err;
 }
+module_init(ksm_init)
-- 
1.5.6.5
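
The sysfs group registered by ksm_init() above is attached to mm_kobj,
so its knobs appear under /sys/kernel/mm/ksm/.  Here is a hypothetical
admin-side sketch (not part of the patch; the run value 1 is assumed
from the KSM_RUN_MERGE test in the patch 10 hunk below) of starting and
tuning the ksmd scanner:

#include <stdio.h>

/* Write a single value to one of the /sys/kernel/mm/ksm/ attributes. */
static int write_knob(const char *name, const char *value)
{
	char path[128];
	FILE *f;

	snprintf(path, sizeof(path), "/sys/kernel/mm/ksm/%s", name);
	f = fopen(path, "w");
	if (!f) {
		perror(path);
		return -1;
	}
	fputs(value, f);
	return fclose(f);
}

int main(void)
{
	write_knob("pages_to_scan", "100");	/* pages scanned per ksmd wakeup */
	write_knob("sleep_millisecs", "20");	/* delay between wakeups */
	write_knob("max_kernel_pages", "2048");	/* cap on pages ksm may allocate for sharing */
	return write_knob("run", "1") ? 1 : 0;	/* 1 = start merging */
}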

--
From: Izik Eidus <ieidus at redhat.com>
Subject: [PATCH 08/10] ksm: prevent mremap move poisoning
Date: 	Fri, 17 Jul 2009 20:30:48 +0300

KSM's scan allows for user pages to be COWed or unmapped at any time,
without requiring any notification.  But its stable tree does assume
that when it finds a KSM page where it placed a KSM page, then it is
the same KSM page that it placed there.

mremap move could break that assumption: if an area containing a KSM
page was unmapped, then an area containing a different KSM page was
moved with mremap into the place of the original, before KSM's scan
came around to notice.  That could then poison a node of the stable
tree, so that memcmps would "lie" and upset the ordering of the tree.

Probably no one will ever need mremap move on a VM_MERGEABLE area;
except that prohibiting it would make trouble for schemes in which we
try making everything VM_MERGEABLE e.g. for testing: an mremap which
normally works would then fail mysteriously.

There's no need to go to any trouble, such as re-sorting KSM's list of
rmap_items to match the new layout: simply unmerge the area to COW all
its KSM pages before moving, but leave VM_MERGEABLE on so that they're
remerged later.

Signed-off-by: Hugh Dickins <hugh.dickins at tiscali.co.uk>
Signed-off-by: Chris Wright <chrisw at redhat.com>
Signed-off-by: Izik Eidus <ieidus at redhat.com>
---
 mm/mremap.c |   12 ++++++++++++
 1 files changed, 12 insertions(+), 0 deletions(-)

diff --git a/mm/mremap.c b/mm/mremap.c
index a39b7b9..93addde 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -11,6 +11,7 @@
 #include <linux/hugetlb.h>
 #include <linux/slab.h>
 #include <linux/shm.h>
+#include <linux/ksm.h>
 #include <linux/mman.h>
 #include <linux/swap.h>
 #include <linux/capability.h>
@@ -182,6 +183,17 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 	if (mm->map_count >= sysctl_max_map_count - 3)
 		return -ENOMEM;
 
+	/*
+	 * Advise KSM to break any KSM pages in the area to be moved:
+	 * it would be confusing if they were to turn up at the new
+	 * location, where they happen to coincide with different KSM
+	 * pages recently unmapped.  But leave vma->vm_flags as it was,
+	 * so KSM can come around to merge on vma and new_vma afterwards.
+	 */
+	if (ksm_madvise(vma, old_addr, old_addr + old_len,
+						MADV_UNMERGEABLE, &vm_flags))
+		return -ENOMEM;
+
 	new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
 	new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff);
 	if (!new_vma)
-- 
1.5.6.5

--
From: Izik Eidus <ieidus at redhat.com>
Subject: [PATCH 09/10] ksm: change copyright message
Date: 	Fri, 17 Jul 2009 20:30:49 +0300

Add Hugh Dickins to the authors list.

Signed-off-by: Izik Eidus <ieidus at redhat.com>
---
 mm/ksm.c |    3 ++-
 1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/mm/ksm.c b/mm/ksm.c
index a0fbdb2..75d7802 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -4,11 +4,12 @@
  * This code enables dynamic sharing of identical pages found in different
  * memory areas, even if they are not shared by fork()
  *
- * Copyright (C) 2008 Red Hat, Inc.
+ * Copyright (C) 2008-2009 Red Hat, Inc.
  * Authors:
  *	Izik Eidus
  *	Andrea Arcangeli
  *	Chris Wright
+ *	Hugh Dickins
  *
  * This work is licensed under the terms of the GNU GPL, version 2.
  */
-- 
1.5.6.5

--
From: Izik Eidus <ieidus at redhat.com>
Subject: [PATCH 10/10] ksm: change ksm nice level to be 5
Date: 	Fri, 17 Jul 2009 20:30:50 +0300

ksm should disturb other tasks as little as possible.

Signed-off-by: Izik Eidus <ieidus at redhat.com>
---
 mm/ksm.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/mm/ksm.c b/mm/ksm.c
index 75d7802..4afe345 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1270,7 +1270,7 @@ static void ksm_do_scan(unsigned int scan_npages)
 
 static int ksm_scan_thread(void *nothing)
 {
-	set_user_nice(current, 0);
+	set_user_nice(current, 5);
 
 	while (!kthread_should_stop()) {
 		if (ksm_run & KSM_RUN_MERGE) {
-- 
1.5.6.5


Index: config-generic
===================================================================
RCS file: /cvs/pkgs/rpms/kernel/devel/config-generic,v
retrieving revision 1.310
retrieving revision 1.311
diff -u -p -r1.310 -r1.311
--- config-generic	23 Jul 2009 17:45:33 -0000	1.310
+++ config-generic	23 Jul 2009 18:25:22 -0000	1.311
@@ -4018,6 +4018,7 @@ CONFIG_STRIP_ASM_SYMS=y
 
 # CONFIG_RCU_FANOUT_EXACT is not set
 
+CONFIG_KSM=y
 CONFIG_DEFAULT_MMAP_MIN_ADDR=4096
 
 CONFIG_FSNOTIFY=y


Index: kernel.spec
===================================================================
RCS file: /cvs/pkgs/rpms/kernel/devel/kernel.spec,v
retrieving revision 1.1650
retrieving revision 1.1651
diff -u -p -r1.1650 -r1.1651
--- kernel.spec	23 Jul 2009 17:45:33 -0000	1.1650
+++ kernel.spec	23 Jul 2009 18:25:22 -0000	1.1651
@@ -652,6 +652,9 @@ Patch1515: linux-2.6.31-lirc.patch
 Patch1516: lirc_streamzap-buffer-rework.patch
 Patch1517: hdpvr-ir-enable.patch
 
+Patch1550: linux-2.6-ksm.patch
+Patch1551: linux-2.6-ksm-kvm.patch
+
 # nouveau + drm fixes
 Patch1813: drm-radeon-pm.patch
 Patch1814: drm-nouveau.patch
@@ -1237,6 +1240,11 @@ ApplyPatch lirc_streamzap-buffer-rework.
 # enable IR receiver on Hauppauge HD PVR (v4l-dvb merge pending)
 ApplyPatch hdpvr-ir-enable.patch
 
+# Add kernel KSM support
+ApplyPatch linux-2.6-ksm.patch
+# Optimize KVM for KSM support
+ApplyPatch linux-2.6-ksm-kvm.patch
+
 ApplyPatch linux-2.6-e1000-ich9.patch
 
 # Nouveau DRM + drm fixes
@@ -1882,6 +1890,9 @@ fi
 # and build.
 
 %changelog
+* Thu Jul 23 2009 Justin M. Forbes <jforbes at redhat.com>
+- Add KSM support
+
 * Thu Jul 23 2009 Kyle McMartin <kyle at redhat.com> 2.6.31-0.87.rc4
 - Linux 2.6.31-rc4
 - config changes:



