
RE: Poor thread performance on Linux vs. Solaris



Try the futex_q_lock-0.2 patch. It is also against linux-2.6.0-test4. 

It does the following things:
* Remove the global futex_lock, as the previous futex_q_lock patch did,
  splitting it into one spinlock per hash chain (see the sketch after
  this list).
* Add a check so the same bucket spinlock is never taken twice when both
  futexes in futex_requeue() hash to the same chain, as Jakub mentioned.
* Move vcache_lock out of lock/unlock_futex_mm() so that it protects
  only the actual vcache operations.
* Shrink some lock/unlock_futex_mm() critical sections.
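
In sketch form, the core of the change plus the recursive-lock check
looks like this (condensed from the patch below; pinning and error
paths omitted):

	struct futex_hash_bucket {
		struct list_head	chain;
		spinlock_t		lock;
	};

	static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];

	/* wake path: lock only the chain this futex hashes to */
	bh = hash_futex(page, offset);
	spin_lock(&bh->lock);
	/* ... walk bh->chain and wake matching waiters ... */
	spin_unlock(&bh->lock);

	/* requeue path: both futexes may hash to the same chain, so
	   never take the same bucket spinlock twice */
	spin_lock(&bh1->lock);
	if (bh1 != bh2)
		spin_lock(&bh2->lock);
	/* ... requeue ... */
	if (bh1 != bh2)
		spin_unlock(&bh2->lock);
	spin_unlock(&bh1->lock);

Futexes that hash to different buckets no longer contend on one global
lock.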

boris

--- linux-2.6.0-test4.orig/kernel/futex.c	2003-08-23 07:53:39.000000000 +0800
+++ linux-2.6.0-test4/kernel/futex.c	2003-09-09 14:15:02.000000000 +0800
@@ -57,9 +57,16 @@
 	struct file *filp;
 };
 
+/* 
+ * Split the global futex_lock into every hash list lock.
+ */ 
+struct futex_hash_bucket {
+	struct list_head	chain;
+	spinlock_t		lock;
+};
+
 /* The key for the hash is the address + index + offset within page */
-static struct list_head futex_queues[1<<FUTEX_HASHBITS];
-static spinlock_t futex_lock = SPIN_LOCK_UNLOCKED;
+static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
 
 extern void send_sigio(struct fown_struct *fown, int fd, int band);
 
@@ -73,21 +80,17 @@
 static inline void lock_futex_mm(void)
 {
 	spin_lock(&current->mm->page_table_lock);
-	spin_lock(&vcache_lock);
-	spin_lock(&futex_lock);
 }
 
 static inline void unlock_futex_mm(void)
 {
-	spin_unlock(&futex_lock);
-	spin_unlock(&vcache_lock);
 	spin_unlock(&current->mm->page_table_lock);
 }
 
 /*
  * The physical page is shared, so we can hash on its address:
  */
-static inline struct list_head *hash_futex(struct page *page, int offset)
+static inline struct futex_hash_bucket *hash_futex(struct page *page, int offset)
 {
 	return &futex_queues[hash_long((unsigned long)page + offset,
 							FUTEX_HASHBITS)];
@@ -153,6 +156,7 @@
 static inline int futex_wake(unsigned long uaddr, int offset, int num)
 {
 	struct list_head *i, *next, *head;
+	struct futex_hash_bucket *bh;
 	struct page *page;
 	int ret = 0;
 
@@ -164,14 +168,22 @@
 		return -EFAULT;
 	}
 
-	head = hash_futex(page, offset);
+	unlock_futex_mm();
+
+	bh = hash_futex(page, offset);
+	spin_lock(&bh->lock);
+	head = &bh->chain;
 
 	list_for_each_safe(i, next, head) {
 		struct futex_q *this = list_entry(i, struct futex_q, list);
 
 		if (this->page == page && this->offset == offset) {
 			list_del_init(i);
+
+			spin_lock(&vcache_lock);
 			__detach_vcache(&this->vcache);
+			spin_unlock(&vcache_lock);
+
 			wake_up_all(&this->waiters);
 			if (this->filp)
 				send_sigio(&this->filp->f_owner, this->fd, POLL_IN);
@@ -180,8 +192,8 @@
 				break;
 		}
 	}
+	spin_unlock(&bh->lock);
 
-	unlock_futex_mm();
 	put_page(page);
 
 	return ret;
@@ -196,19 +208,19 @@
 static void futex_vcache_callback(vcache_t *vcache, struct page *new_page)
 {
 	struct futex_q *q = container_of(vcache, struct futex_q, vcache);
-	struct list_head *head = hash_futex(new_page, q->offset);
+	struct futex_hash_bucket *head = hash_futex(new_page, q->offset);
 
-	spin_lock(&futex_lock);
+	spin_lock(&head->lock);
 
 	if (!list_empty(&q->list)) {
 		put_page(q->page);
 		q->page = new_page;
 		__pin_page_atomic(new_page);
 		list_del(&q->list);
-		list_add_tail(&q->list, head);
+		list_add_tail(&q->list, &head->chain);
 	}
 
-	spin_unlock(&futex_lock);
+	spin_unlock(&head->lock);
 }
 
 /*
@@ -219,6 +231,7 @@
 	unsigned long uaddr2, int offset2, int nr_wake, int nr_requeue)
 {
 	struct list_head *i, *next, *head1, *head2;
+	struct futex_hash_bucket *bh1, *bh2;
 	struct page *page1 = NULL, *page2 = NULL;
 	int ret = 0;
 
@@ -231,26 +244,38 @@
 	if (!page2)
 		goto out;
 
-	head1 = hash_futex(page1, offset1);
-	head2 = hash_futex(page2, offset2);
+	unlock_futex_mm();
 
+	bh1 = hash_futex(page1, offset1);
+	bh2 = hash_futex(page2, offset2);
+	spin_lock(&bh1->lock);
+	if (bh1 != bh2)
+		spin_lock(&bh2->lock);
+	head1 = &bh1->chain;
+	head2 = &bh2->chain;
+       
 	list_for_each_safe(i, next, head1) {
 		struct futex_q *this = list_entry(i, struct futex_q, list);
 
 		if (this->page == page1 && this->offset == offset1) {
 			list_del_init(i);
+			spin_lock(&vcache_lock);
 			__detach_vcache(&this->vcache);
 			if (++ret <= nr_wake) {
+				spin_unlock(&vcache_lock);
 				wake_up_all(&this->waiters);
 				if (this->filp)
 					send_sigio(&this->filp->f_owner,
 							this->fd, POLL_IN);
 			} else {
 				put_page(this->page);
+				lock_futex_mm();
 				__pin_page_atomic (page2);
 				list_add_tail(i, head2);
 				__attach_vcache(&this->vcache, uaddr2,
 					current->mm, futex_vcache_callback);
+				unlock_futex_mm();
+				spin_unlock(&vcache_lock);
 				this->offset = offset2;
 				this->page = page2;
 				if (ret - nr_wake >= nr_requeue)
@@ -260,7 +285,9 @@
 	}
 
 out:
-	unlock_futex_mm();
+	if (bh1 != bh2)
+		spin_unlock(&bh2->lock);
+	spin_unlock(&bh1->lock);
 
 	if (page1)
 		put_page(page1);
@@ -274,7 +301,7 @@
 				unsigned long uaddr, int offset,
 				int fd, struct file *filp)
 {
-	struct list_head *head = hash_futex(page, offset);
+	struct list_head *head = &hash_futex(page, offset)->chain;
 
 	q->offset = offset;
 	q->fd = fd;
@@ -293,15 +320,16 @@
 static inline int unqueue_me(struct futex_q *q)
 {
 	int ret = 0;
+	struct futex_hash_bucket *bh = hash_futex(q->page, q->offset);
 
 	spin_lock(&vcache_lock);
-	spin_lock(&futex_lock);
+	spin_lock(&bh->lock);
 	if (!list_empty(&q->list)) {
 		list_del(&q->list);
 		__detach_vcache(&q->vcache);
 		ret = 1;
 	}
-	spin_unlock(&futex_lock);
+	spin_unlock(&bh->lock);
 	spin_unlock(&vcache_lock);
 	return ret;
 }
@@ -315,6 +343,7 @@
 	int ret = 0, curval;
 	struct page *page;
 	struct futex_q q;
+	struct futex_hash_bucket *bh;
 
 	init_waitqueue_head(&q.waiters);
 
@@ -325,18 +354,27 @@
 		unlock_futex_mm();
 		return -EFAULT;
 	}
+	
+	bh = hash_futex(page, offset);
+	spin_lock(&bh->lock);
+	spin_lock(&vcache_lock);
+
 	__queue_me(&q, page, uaddr, offset, -1, NULL);
 
+	spin_unlock(&vcache_lock);
+
 	/*
 	 * Page is pinned, but may no longer be in this address space.
 	 * It cannot schedule, so we access it with the spinlock held.
 	 */
 	if (get_user(curval, (int *)uaddr) != 0) {
+		spin_unlock(&bh->lock);
 		unlock_futex_mm();
 		ret = -EFAULT;
 		goto out;
 	}
 	if (curval != val) {
+		spin_unlock(&bh->lock);
 		unlock_futex_mm();
 		ret = -EWOULDBLOCK;
 		goto out;
@@ -351,6 +389,7 @@
 	add_wait_queue(&q.waiters, &wait);
 	set_current_state(TASK_INTERRUPTIBLE);
 	if (!list_empty(&q.list)) {
+		spin_unlock(&bh->lock);
 		unlock_futex_mm();
 		time = schedule_timeout(time);
 	}
@@ -389,13 +428,14 @@
 			       struct poll_table_struct *wait)
 {
 	struct futex_q *q = filp->private_data;
+	struct futex_hash_bucket *bh = hash_futex(q->page, q->offset);
 	int ret = 0;
 
 	poll_wait(filp, &q->waiters, wait);
-	spin_lock(&futex_lock);
+	spin_lock(&bh->lock);
 	if (list_empty(&q->list))
 		ret = POLLIN | POLLRDNORM;
-	spin_unlock(&futex_lock);
+	spin_unlock(&bh->lock);
 
 	return ret;
 }
@@ -411,6 +451,7 @@
 {
 	struct page *page = NULL;
 	struct futex_q *q;
+	struct futex_hash_bucket *bh;
 	struct file *filp;
 	int ret;
 
@@ -466,8 +507,14 @@
 	init_waitqueue_head(&q->waiters);
 	filp->private_data = q;
 
+	bh = hash_futex(page, offset);
+	spin_lock(&bh->lock);
+	spin_lock(&vcache_lock);
+
 	__queue_me(q, page, uaddr, offset, ret, filp);
 
+	spin_unlock(&vcache_lock);
+	spin_unlock(&bh->lock);
 	unlock_futex_mm();
 
 	/* Now we map fd to filp, so userspace can access it */
@@ -563,8 +610,10 @@
 	register_filesystem(&futex_fs_type);
 	futex_mnt = kern_mount(&futex_fs_type);
 
-	for (i = 0; i < ARRAY_SIZE(futex_queues); i++)
-		INIT_LIST_HEAD(&futex_queues[i]);
+	for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
+		futex_queues[i].lock = SPIN_LOCK_UNLOCKED;
+		INIT_LIST_HEAD(&futex_queues[i].chain);
+	}
 	return 0;
 }
 __initcall(init);

> 
> > From: Bill Soudan [mailto:bsoudan brass com]
> >
> > It applies cleanly and the performance test runs to completion, but
> > unfortunately, it doesn't appear to make much of a difference wrt
> > system time.  I'm going to spend some time with oprofile again now
> > that I have your patch applied, and see if I receive different results.
> 
> It'd be interesting to verify if there is still a lot of contention in
> the locking.
> 
> A question about your app: are there many different locks used at the
> same time, or is the number actually low?
> 
> No matter how we do it, there is still a bunch of lock acquisition in
> the futex code that would serialize access [especially if they are
> locks shared between threads of the same process, like
> current->mm->page_table_lock].
> 
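Concretely, even with the per-chain hash locks, every futex operation in
the 0.2 patch above still starts with what is left of lock_futex_mm():

	static inline void lock_futex_mm(void)
	{
		spin_lock(&current->mm->page_table_lock);
	}

so threads of the same process keep serializing on the mm-wide lock no
matter how their futexes hash.
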
> And then we also have vcache_lock, which also serializes everything,
> as it is taken unconditionally whenever we call lock_futex_mm().
> 
> Probably if we took vcache_lock just before doing any actual vcache
> operation, we'd at least improve the preemptibility on that.
> 
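The 0.2 patch above does exactly that: vcache_lock is now taken only
around the vcache operations themselves, e.g. in futex_wake():

	spin_lock(&vcache_lock);
	__detach_vcache(&this->vcache);
	spin_unlock(&vcache_lock);
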
> We still need to deal with current->mm->page_table_lock. I am kind of
> more green on that one, but here is a question: wouldn't it be possible
> to drop it once we have pinned the page? This is the one that is going
> to hurt when interlocking between threads of the same process.
> 
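A minimal, untested sketch of that idea (assuming the pin alone keeps
the page valid for hashing and queueing after the unlock):

	spin_lock(&current->mm->page_table_lock);
	page = __pin_page(uaddr - offset);
	spin_unlock(&current->mm->page_table_lock);
	if (!page)
		return -EFAULT;
	/* from here on, only the per-chain bucket lock is needed */

Whether the later get_user() in futex_wait() remains safe without
page_table_lock held would need a closer look.
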
> Iñaky Pérez-González -- Not speaking for Intel -- all opinions are my
> own (and my fault)

Attachment: futex_q_lock-0.2.diff
Description: futex_q_lock-0.2.diff

