Patch from John Levon

spinlocks are no kind of protection on UP: there they compile away to
nothing, so the trylock-based exclusion the CPU buffers relied on never
excludes anybody.  This patch replaces the lock with a window-based
lockless reader/writer, originally by Will Cohen.

 oprofile/buffer_sync.c    |   54 ++++++++++++++++++++---------
 oprofile/cpu_buffer.c     |   84 ++++++++++++++++++++++++++++------------------
 oprofile/cpu_buffer.h     |    7 +--
 oprofile/oprofile_stats.c |    3 -
 4 files changed, 92 insertions(+), 56 deletions(-)

diff -puN drivers/oprofile/buffer_sync.c~oprofile-up-fix drivers/oprofile/buffer_sync.c
--- 25/drivers/oprofile/buffer_sync.c~oprofile-up-fix	2003-02-23 22:39:23.000000000 -0800
+++ 25-akpm/drivers/oprofile/buffer_sync.c	2003-02-23 22:39:23.000000000 -0800
@@ -277,10 +277,7 @@ static void release_mm(struct mm_struct
  */
 static struct mm_struct * take_task_mm(struct task_struct * task)
 {
-	struct mm_struct * mm;
-	task_lock(task);
-	mm = task->mm;
-	task_unlock(task);
+	struct mm_struct * mm = task->mm;
 
 	/* if task->mm !NULL, mm_count must be at least 1. It cannot
 	 * drop to 0 without the task exiting, which will have to sleep
@@ -310,6 +307,32 @@ static inline int is_ctx_switch(unsigned
 }
 
 
+/* compute number of filled slots in cpu_buffer queue */
+static unsigned long nr_filled_slots(struct oprofile_cpu_buffer * b)
+{
+	unsigned long head = b->head_pos;
+	unsigned long tail = b->tail_pos;
+
+	if (head >= tail)
+		return head - tail;
+
+	return head + (b->buffer_size - tail);
+}
+
+
+static void increment_tail(struct oprofile_cpu_buffer * b)
+{
+	unsigned long new_tail = b->tail_pos + 1;
+
+	rmb();
+
+	if (new_tail < (b->buffer_size))
+		b->tail_pos = new_tail;
+	else
+		b->tail_pos = 0;
+}
+
+
 /* Sync one of the CPU's buffers into the global event buffer.
  * Here we need to go through each batch of samples punctuated
  * by context switch notes, taking the task's mmap_sem and doing
@@ -324,8 +347,12 @@ static void sync_buffer(struct oprofile_
 	int in_kernel = 1;
 	int i;
 
-	for (i=0; i < cpu_buf->pos; ++i) {
-		struct op_sample * s = &cpu_buf->buffer[i];
+	/* Remember, only we can modify tail_pos */
+
+	unsigned long const available_elements = nr_filled_slots(cpu_buf);
+
+	for (i=0; i < available_elements; ++i) {
+		struct op_sample * s = &cpu_buf->buffer[cpu_buf->tail_pos];
 
 		if (is_ctx_switch(s->eip)) {
 			if (s->event <= 1) {
@@ -345,6 +372,8 @@ static void sync_buffer(struct oprofile_
 		} else {
 			add_sample(mm, s, in_kernel);
 		}
+
+		increment_tail(cpu_buf);
 	}
 	release_mm(mm);
 
@@ -369,17 +398,8 @@ static void sync_cpu_buffers(void)
 
 		cpu_buf = &cpu_buffer[i];
 
-		/* We take a spin lock even though we might
-		 * sleep. It's OK because other users are try
-		 * lockers only, and this region is already
-		 * protected by buffer_sem. It's raw to prevent
-		 * the preempt bogometer firing. Fruity, huh ?
-		 */
-		if (cpu_buf->pos > 0) {
-			_raw_spin_lock(&cpu_buf->int_lock);
-			add_cpu_switch(i);
-			sync_buffer(cpu_buf);
-			_raw_spin_unlock(&cpu_buf->int_lock);
-		}
+		add_cpu_switch(i);
+		sync_buffer(cpu_buf);
 	}
 
 	up(&buffer_sem);
diff -puN drivers/oprofile/cpu_buffer.c~oprofile-up-fix drivers/oprofile/cpu_buffer.c
--- 25/drivers/oprofile/cpu_buffer.c~oprofile-up-fix	2003-02-23 22:39:23.000000000 -0800
+++ 25-akpm/drivers/oprofile/cpu_buffer.c	2003-02-23 22:39:23.000000000 -0800
@@ -26,8 +26,6 @@
 
 struct oprofile_cpu_buffer cpu_buffer[NR_CPUS] __cacheline_aligned;
 
-static unsigned long buffer_size;
-
 static void __free_cpu_buffers(int num)
 {
 	int i;
@@ -47,7 +45,7 @@ int alloc_cpu_buffers(void)
 {
 	int i;
 
-	buffer_size = fs_cpu_buffer_size;
+	unsigned long buffer_size = fs_cpu_buffer_size;
 
 	for (i=0; i < NR_CPUS; ++i) {
 		struct oprofile_cpu_buffer * b = &cpu_buffer[i];
@@ -59,12 +57,12 @@ int alloc_cpu_buffers(void)
 		if (!b->buffer)
 			goto fail;
 
-		spin_lock_init(&b->int_lock);
-		b->pos = 0;
 		b->last_task = 0;
 		b->last_is_kernel = -1;
+		b->buffer_size = buffer_size;
+		b->tail_pos = 0;
+		b->head_pos = 0;
 		b->sample_received = 0;
-		b->sample_lost_locked = 0;
 		b->sample_lost_overflow = 0;
 		b->sample_lost_task_exit = 0;
 	}
@@ -80,11 +78,41 @@ void free_cpu_buffers(void)
 	__free_cpu_buffers(NR_CPUS);
 }
 
-
-/* Note we can't use a semaphore here as this is supposed to
- * be safe from any context. Instead we trylock the CPU's int_lock.
- * int_lock is taken by the processing code in sync_cpu_buffers()
- * so we avoid disturbing that.
+
+/* compute number of available slots in cpu_buffer queue */
+static unsigned long nr_available_slots(struct oprofile_cpu_buffer const * b)
+{
+	unsigned long head = b->head_pos;
+	unsigned long tail = b->tail_pos;
+
+	if (tail == head)
+		return b->buffer_size;
+
+	if (tail > head)
+		return tail - head;
+
+	return tail + (b->buffer_size - head);
+}
+
+
+static void increment_head(struct oprofile_cpu_buffer * b)
+{
+	unsigned long new_head = b->head_pos + 1;
+
+	/* Ensure anything written to the slot before we
+	 * increment is visible */
+	wmb();
+
+	if (new_head < (b->buffer_size))
+		b->head_pos = new_head;
+	else
+		b->head_pos = 0;
+}
+
+
+/* This must be safe from any context. It's safe writing here
+ * because of the head/tail separation of the writer and reader
+ * of the CPU buffer.
  *
  * is_kernel is needed because on some architectures you cannot
  * tell if you are in kernel or user space simply by looking at
@@ -101,14 +129,10 @@ void oprofile_add_sample(unsigned long e
 
 	cpu_buf->sample_received++;
 
-	if (!spin_trylock(&cpu_buf->int_lock)) {
-		cpu_buf->sample_lost_locked++;
-		return;
-	}
-
-	if (cpu_buf->pos > buffer_size - 2) {
+	if (nr_available_slots(cpu_buf) < 3) {
 		cpu_buf->sample_lost_overflow++;
-		goto out;
+		return;
 	}
 
 	task = current;
@@ -116,18 +140,18 @@ void oprofile_add_sample(unsigned long e
 	/* notice a switch from user->kernel or vice versa */
 	if (cpu_buf->last_is_kernel != is_kernel) {
 		cpu_buf->last_is_kernel = is_kernel;
-		cpu_buf->buffer[cpu_buf->pos].eip = ~0UL;
-		cpu_buf->buffer[cpu_buf->pos].event = is_kernel;
-		cpu_buf->pos++;
+		cpu_buf->buffer[cpu_buf->head_pos].eip = ~0UL;
+		cpu_buf->buffer[cpu_buf->head_pos].event = is_kernel;
+		increment_head(cpu_buf);
 	}
 
 	/* notice a task switch */
 	if (cpu_buf->last_task != task) {
 		cpu_buf->last_task = task;
 		if (!(task->flags & PF_EXITING)) {
-			cpu_buf->buffer[cpu_buf->pos].eip = ~0UL;
-			cpu_buf->buffer[cpu_buf->pos].event = (unsigned long)task;
-			cpu_buf->pos++;
+			cpu_buf->buffer[cpu_buf->head_pos].eip = ~0UL;
+			cpu_buf->buffer[cpu_buf->head_pos].event = (unsigned long)task;
+			increment_head(cpu_buf);
 		}
 	}
 
@@ -138,23 +162,20 @@ void oprofile_add_sample(unsigned long e
 	 */
 	if (task->flags & PF_EXITING) {
 		cpu_buf->sample_lost_task_exit++;
-		goto out;
+		return;
 	}
 
-	cpu_buf->buffer[cpu_buf->pos].eip = eip;
-	cpu_buf->buffer[cpu_buf->pos].event = event;
-	cpu_buf->pos++;
-out:
-	spin_unlock(&cpu_buf->int_lock);
+	cpu_buf->buffer[cpu_buf->head_pos].eip = eip;
+	cpu_buf->buffer[cpu_buf->head_pos].event = event;
+	increment_head(cpu_buf);
 }
 
+
 /* resets the cpu buffer to a sane state - should be called with
  * cpu_buf->int_lock held */
 void cpu_buffer_reset(struct oprofile_cpu_buffer *cpu_buf)
 {
-	cpu_buf->pos = 0;
-
 	/* reset these to invalid values; the next sample
 	 * collected will populate the buffer with proper
 	 * values to initialize the buffer
@@ -162,4 +183,3 @@ void cpu_buffer_reset(struct oprofile_cp
 	cpu_buf->last_is_kernel = -1;
 	cpu_buf->last_task = 0;
 }
-
diff -puN drivers/oprofile/cpu_buffer.h~oprofile-up-fix drivers/oprofile/cpu_buffer.h
--- 25/drivers/oprofile/cpu_buffer.h~oprofile-up-fix	2003-02-23 22:39:23.000000000 -0800
+++ 25-akpm/drivers/oprofile/cpu_buffer.h	2003-02-23 22:39:23.000000000 -0800
@@ -30,14 +30,13 @@ struct op_sample {
 };
 
 struct oprofile_cpu_buffer {
-	spinlock_t int_lock;
-	/* protected by int_lock */
-	unsigned long pos;
+	volatile unsigned long head_pos;
+	volatile unsigned long tail_pos;
+	unsigned long buffer_size;
 	struct task_struct * last_task;
 	int last_is_kernel;
 	struct op_sample * buffer;
 	unsigned long sample_received;
-	unsigned long sample_lost_locked;
 	unsigned long sample_lost_overflow;
 	unsigned long sample_lost_task_exit;
 } ____cacheline_aligned;
diff -puN drivers/oprofile/oprofile_stats.c~oprofile-up-fix drivers/oprofile/oprofile_stats.c
--- 25/drivers/oprofile/oprofile_stats.c~oprofile-up-fix	2003-02-23 22:39:23.000000000 -0800
+++ 25-akpm/drivers/oprofile/oprofile_stats.c	2003-02-23 22:39:23.000000000 -0800
@@ -27,7 +27,6 @@ void oprofile_reset_stats(void)
 		cpu_buf = &cpu_buffer[i];
 
 		cpu_buf->sample_received = 0;
-		cpu_buf->sample_lost_locked = 0;
 		cpu_buf->sample_lost_overflow = 0;
 		cpu_buf->sample_lost_task_exit = 0;
 	}
@@ -63,8 +62,6 @@ void oprofile_create_stats_files(struct
 	 */
 	oprofilefs_create_ro_ulong(sb, cpudir, "sample_received",
 		&cpu_buf->sample_received);
-	oprofilefs_create_ro_ulong(sb, cpudir, "sample_lost_locked",
-		&cpu_buf->sample_lost_locked);
 	oprofilefs_create_ro_ulong(sb, cpudir, "sample_lost_overflow",
 		&cpu_buf->sample_lost_overflow);
 	oprofilefs_create_ro_ulong(sb, cpudir, "sample_lost_task_exit",
 		&cpu_buf->sample_lost_task_exit);
_
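
For readers unfamiliar with the pattern: this works because the buffer has
exactly one writer (the interrupt path appending at head_pos) and one reader
(sync_buffer() consuming at tail_pos), so each index has a single owner and
needs no lock; the wmb()/rmb() calls only order slot contents against index
updates.  Below is a minimal standalone sketch of the same single-producer/
single-consumer ring discipline, using C11 release/acquire atomics in place
of the kernel barriers.  The names (buffer_put, buffer_get), the fixed
BUF_SIZE, and the one-empty-slot full/empty convention are illustrative
choices for this sketch, not part of the patch.

/* Minimal SPSC ring-buffer sketch -- illustrative userspace code,
 * not kernel code.  C11 release/acquire ordering stands in for the
 * patch's wmb()/rmb(). */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define BUF_SIZE 8	/* hypothetical capacity */

struct sample {
	unsigned long eip;
	unsigned long event;
};

static struct sample buffer[BUF_SIZE];
static _Atomic unsigned long head_pos;	/* advanced only by the writer */
static _Atomic unsigned long tail_pos;	/* advanced only by the reader */

/* Writer side: fill the slot first, then publish it by advancing
 * head_pos.  The release store plays the role of wmb() in the
 * patch's increment_head(). */
static bool buffer_put(struct sample s)
{
	unsigned long head = atomic_load_explicit(&head_pos, memory_order_relaxed);
	unsigned long next = (head + 1) % BUF_SIZE;

	/* One slot stays unused so head == tail always means "empty". */
	if (next == atomic_load_explicit(&tail_pos, memory_order_acquire))
		return false;	/* full: caller would bump an overflow count */

	buffer[head] = s;
	atomic_store_explicit(&head_pos, next, memory_order_release);
	return true;
}

/* Reader side: the acquire load of head_pos pairs with the writer's
 * release store, so the slot contents are visible before we read
 * them -- the same ordering rmb() provides in increment_tail(). */
static bool buffer_get(struct sample *out)
{
	unsigned long tail = atomic_load_explicit(&tail_pos, memory_order_relaxed);

	if (tail == atomic_load_explicit(&head_pos, memory_order_acquire))
		return false;	/* empty */

	*out = buffer[tail];
	atomic_store_explicit(&tail_pos, (tail + 1) % BUF_SIZE,
			      memory_order_release);
	return true;
}

int main(void)
{
	struct sample s;

	buffer_put((struct sample){ .eip = 0xc0100000UL, .event = 1 });
	buffer_put((struct sample){ .eip = 0xc0200000UL, .event = 1 });

	while (buffer_get(&s))
		printf("eip=%#lx event=%lu\n", s.eip, s.event);
	return 0;
}

One difference worth noting: the sketch distinguishes full from empty by
leaving one slot unused, while the patch treats head_pos == tail_pos as
empty and instead refuses to queue when fewer than three slots remain,
since a single oprofile_add_sample() call can append up to three entries
(a kernel/user switch note, a task switch note, and the sample itself).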