/*
 * This file implements the perfmon subsystem which is used
 * to program the IA-64 Performance Monitoring Unit (PMU).
 *
 * Originally Written by Ganesh Venkitachalam, IBM Corp.
 * Copyright (C) 1999 Ganesh Venkitachalam
 *
 * Modifications by Stephane Eranian, Hewlett-Packard Co.
 * Modifications by David Mosberger-Tang, Hewlett-Packard Co.
 *
 * Copyright (C) 1999-2002 Hewlett Packard Co
 *               Stephane Eranian
 *               David Mosberger-Tang
 */

#include <linux/config.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/interrupt.h>
#include <linux/smp_lock.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/vmalloc.h>
#include <linux/wrapper.h>
#include <linux/mm.h>

#include <asm/bitops.h>
#include <asm/errno.h>
#include <asm/page.h>
#include <asm/pal.h>
#include <asm/perfmon.h>
#include <asm/processor.h>
#include <asm/signal.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/delay.h> /* for ia64_get_itc() */

#ifdef CONFIG_PERFMON

/*
 * For PMUs which rely on the debug registers for some features, you must
 * enable the following flag to activate support for accessing the
 * registers via the perfmonctl() interface.
 */
#ifdef CONFIG_ITANIUM
#define PFM_PMU_USES_DBR	1
#endif

/*
 * perfmon context states
 */
#define PFM_CTX_DISABLED	0
#define PFM_CTX_ENABLED		1

/*
 * Reset register flags
 */
#define PFM_RELOAD_LONG_RESET	1
#define PFM_RELOAD_SHORT_RESET	2

/*
 * Misc macros and definitions
 */
#define PMU_FIRST_COUNTER	4

#define PFM_IS_DISABLED()	pmu_conf.pfm_is_disabled

#define PMC_OVFL_NOTIFY(ctx, i)	((ctx)->ctx_soft_pmds[i].flags & PFM_REGFL_OVFL_NOTIFY)
#define PFM_FL_INHERIT_MASK	(PFM_FL_INHERIT_NONE|PFM_FL_INHERIT_ONCE|PFM_FL_INHERIT_ALL)

#define PMC_IS_IMPL(i)		(i < pmu_conf.num_pmcs && pmu_conf.impl_regs[i>>6] & (1UL<<(i) % 64))
#define PMD_IS_IMPL(i)		(i < pmu_conf.num_pmds && pmu_conf.impl_regs[4+(i>>6)] & (1UL<<(i) % 64))
#define PMD_IS_COUNTING(i)	(i >= 0 && i < 256 && pmu_conf.counter_pmds[i>>6] & (1UL<<(i) % 64))
#define PMC_IS_COUNTING(i)	PMD_IS_COUNTING(i)

#define IBR_IS_IMPL(k)		(k < pmu_conf.num_ibrs)
#define DBR_IS_IMPL(k)		(k < pmu_conf.num_dbrs)

#define PMC_IS_BTB(a)		(((pfm_monitor_t *)(a))->pmc_es == PMU_BTB_EVENT)

#define LSHIFT(x)		(1UL<<(x))
#define PMM(x)			LSHIFT(x)
#define PMC_IS_MONITOR(c)	((pmu_conf.monitor_pmcs[0] & PMM((c))) != 0)

#define CTX_IS_ENABLED(c)	((c)->ctx_flags.state == PFM_CTX_ENABLED)
#define CTX_OVFL_NOBLOCK(c)	((c)->ctx_fl_block == 0)
#define CTX_INHERIT_MODE(c)	((c)->ctx_fl_inherit)
#define CTX_HAS_SMPL(c)		((c)->ctx_psb != NULL)
#define CTX_USED_PMD(ctx,n)	(ctx)->ctx_used_pmds[(n)>>6] |= 1UL<<((n) % 64)
#define CTX_USED_IBR(ctx,n)	(ctx)->ctx_used_ibrs[(n)>>6] |= 1UL<<((n) % 64)
#define CTX_USED_DBR(ctx,n)	(ctx)->ctx_used_dbrs[(n)>>6] |= 1UL<<((n) % 64)
#define CTX_USES_DBREGS(ctx)	(((pfm_context_t *)(ctx))->ctx_fl_using_dbreg == 1)

#define LOCK_CTX(ctx)		spin_lock(&(ctx)->ctx_lock)
#define UNLOCK_CTX(ctx)		spin_unlock(&(ctx)->ctx_lock)

#define SET_PMU_OWNER(t)	do { pmu_owners[smp_processor_id()].owner = (t); } while(0)
#define PMU_OWNER()		pmu_owners[smp_processor_id()].owner

#define LOCK_PFS()		spin_lock(&pfm_sessions.pfs_lock)
#define UNLOCK_PFS()		spin_unlock(&pfm_sessions.pfs_lock)

#define PFM_REG_RETFLAG_SET(flags, val)	do { flags &= ~PFM_REG_RETFL_MASK; flags |= (val); } while(0)

/*
 * debugging
 */
#define DBprintk(a) \
	do { \
		if (pfm_debug_mode > 0) { printk("%s.%d: CPU%d ", __FUNCTION__, __LINE__, smp_processor_id()); printk a; } \
	} while (0)

/*
 * These are some helpful architected PMC and IBR/DBR register layouts
 */
typedef struct {
	unsigned long pmc_plm:4;	/* privilege level mask */
	unsigned long pmc_ev:1;		/* external visibility */
	unsigned long pmc_oi:1;		/* overflow interrupt */
	unsigned long pmc_pm:1;		/* privileged monitor */
	unsigned long pmc_ig1:1;	/* reserved */
	unsigned long pmc_es:8;		/* event select */
	unsigned long pmc_ig2:48;	/* reserved */
} pfm_monitor_t;
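/*
 * Note on the implementation bitvector macros above: impl_regs[] mirrors
 * the PAL buffer, with the first four 64-bit words describing implemented
 * PMCs and the next four describing implemented PMDs, which is why
 * PMD_IS_IMPL() indexes at impl_regs[4+(i>>6)]. For example, PMD 70 maps
 * to word 4+1 = 5, bit 70 % 64 = 6.
 */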
/*
 * There is one such data structure per perfmon context. It is used to
 * describe the sampling buffer. It is to be shared among siblings whereas
 * the pfm_context is not. Therefore we maintain a refcnt which is
 * incremented on fork(). This descriptor is private to the kernel; only
 * the actual sampling buffer, including its header, is exposed to the
 * user. This construct allows us to export the buffer read-write, if
 * needed, without worrying about security problems.
 */
typedef struct _pfm_smpl_buffer_desc {
	spinlock_t		psb_lock;	/* protection lock */
	unsigned long		psb_refcnt;	/* how many users for the buffer */
	int			psb_flags;	/* bitvector of flags */

	void			*psb_addr;	/* points to location of first entry */
	unsigned long		psb_entries;	/* maximum number of entries */
	unsigned long		psb_size;	/* aligned size of buffer */
	unsigned long		psb_index;	/* next free entry slot XXX: must use the one in buffer */
	unsigned long		psb_entry_size;	/* size of each entry including entry header */
	perfmon_smpl_hdr_t	*psb_hdr;	/* points to sampling buffer header */

	struct _pfm_smpl_buffer_desc *psb_next;	/* next psb, used for rvfreeing of psb_hdr */
} pfm_smpl_buffer_desc_t;

#define LOCK_PSB(p)	spin_lock(&(p)->psb_lock)
#define UNLOCK_PSB(p)	spin_unlock(&(p)->psb_lock)

#define PFM_PSB_VMA	0x1	/* a VMA is describing the buffer */

/*
 * This structure is initialized at boot time and contains
 * a description of the PMU main characteristics as indicated by PAL.
 */
typedef struct {
	unsigned long pfm_is_disabled;	/* indicates if perfmon is working properly */
	unsigned long perf_ovfl_val;	/* overflow value for generic counters */
	unsigned long max_counters;	/* upper limit on counter pair (PMC/PMD) */
	unsigned long num_pmcs;		/* highest PMC implemented (may have holes) */
	unsigned long num_pmds;		/* highest PMD implemented (may have holes) */
	unsigned long impl_regs[16];	/* buffer used to hold implemented PMC/PMD mask */
	unsigned long num_ibrs;		/* number of instruction debug registers */
	unsigned long num_dbrs;		/* number of data debug registers */
	unsigned long monitor_pmcs[4];	/* which pmc are controlling monitors */
	unsigned long counter_pmds[4];	/* which pmd are used as counters */
} pmu_config_t;

/*
 * 64-bit software counter structure
 */
typedef struct {
	u64 val;		/* virtual 64bit counter value */
	u64 ival;		/* initial value from user */
	u64 long_reset;		/* reset value on sampling overflow */
	u64 short_reset;	/* reset value on overflow */
	u64 reset_pmds[4];	/* which other pmds to reset when this counter overflows */
	int flags;		/* notify/do not notify */
} pfm_counter_t;
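/*
 * A counting PMD is virtualized to 64 bits by splitting it between
 * hardware and software: the hardware register holds the low bits (those
 * covered by pmu_conf.perf_ovfl_val) while pfm_counter_t.val accumulates
 * the bits above that. The full value is therefore reconstructed as
 *
 *	value = ctx_soft_pmds[i].val + (ia64_get_pmd(i) & pmu_conf.perf_ovfl_val)
 *
 * which is exactly what pfm_read_soft_counter() below computes.
 */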
/*
 * perfmon context flags. One context per process; it is cloned on fork()
 * depending on the inheritance flags.
 */
typedef struct {
	unsigned int state:1;		/* 0=disabled, 1=enabled */
	unsigned int inherit:2;		/* inherit mode */
	unsigned int block:1;		/* when 1, task will be blocked on user notifications */
	unsigned int system:1;		/* do system wide monitoring */
	unsigned int frozen:1;		/* pmu must be kept frozen on ctxsw in */
	unsigned int protected:1;	/* allow access to creator of context only */
	unsigned int using_dbreg:1;	/* using range restrictions (debug registers) */
	unsigned int reserved:24;
} pfm_context_flags_t;

/*
 * perfmon context: encapsulates all the state of a monitoring session
 * XXX: probably need to change layout
 */
typedef struct pfm_context {
	pfm_smpl_buffer_desc_t	*ctx_psb;		/* sampling buffer, if any */
	unsigned long		ctx_smpl_vaddr;		/* user level virtual address of smpl buffer */

	spinlock_t		ctx_lock;
	pfm_context_flags_t	ctx_flags;		/* block/noblock */

	struct task_struct	*ctx_notify_task;	/* who to notify on overflow */
	struct task_struct	*ctx_owner;		/* pid of creator (debug) */

	unsigned long		ctx_ovfl_regs[4];	/* which registers overflowed (notification) */
	unsigned long		ctx_smpl_regs[4];	/* which registers to record on overflow */

	struct semaphore	ctx_restart_sem;	/* use for blocking notification mode */

	unsigned long		ctx_used_pmds[4];	/* bitmask of used PMD (speedup ctxsw) */
	unsigned long		ctx_saved_pmcs[4];	/* bitmask of PMC to save on ctxsw */
	unsigned long		ctx_reload_pmcs[4];	/* bitmask of PMC to reload on ctxsw (SMP) */

	unsigned long		ctx_used_ibrs[4];	/* bitmask of used IBR (speedup ctxsw) */
	unsigned long		ctx_used_dbrs[4];	/* bitmask of used DBR (speedup ctxsw) */

	pfm_counter_t		ctx_soft_pmds[IA64_NUM_PMD_REGS]; /* XXX: size should be dynamic */

	u64			ctx_saved_psr;		/* copy of psr used for lazy ctxsw */
	unsigned long		ctx_saved_cpus_allowed;	/* copy of the task cpus_allowed (system wide) */
	unsigned long		ctx_cpu;		/* cpu to which perfmon is applied (system wide) */

	atomic_t		ctx_saving_in_progress;	/* flag indicating actual save in progress */
	atomic_t		ctx_last_cpu;		/* CPU id of current or last CPU used */
} pfm_context_t;

#define ctx_fl_inherit		ctx_flags.inherit
#define ctx_fl_block		ctx_flags.block
#define ctx_fl_system		ctx_flags.system
#define ctx_fl_frozen		ctx_flags.frozen
#define ctx_fl_protected	ctx_flags.protected
#define ctx_fl_using_dbreg	ctx_flags.using_dbreg

/*
 * global information about all sessions
 * mostly used to synchronize between system wide and per-process
 */
typedef struct {
	spinlock_t		pfs_lock;		   /* lock the structure */
	unsigned long		pfs_task_sessions;	   /* number of per task sessions */
	unsigned long		pfs_sys_sessions;	   /* number of per system wide sessions */
	unsigned long		pfs_sys_use_dbregs;	   /* incremented when a system wide session uses debug regs */
	unsigned long		pfs_ptrace_use_dbregs;	   /* incremented when a process uses debug regs */
	struct task_struct	*pfs_sys_session[NR_CPUS]; /* point to task owning a system-wide session */
} pfm_session_t;
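/*
 * Invariant maintained under pfs_lock (see pfm_create_context()): per-task
 * and system-wide sessions are mutually exclusive, i.e. pfs_task_sessions
 * can only grow while pfs_sys_sessions == 0, and vice versa.
 */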
/*
 * structure used to pass arguments to/from a remote CPU
 * using IPI to check and possibly save the PMU context on SMP systems.
 *
 * not used in UP kernels
 */
typedef struct {
	struct task_struct *task;	/* which task we are interested in */
	int retval;			/* return value of the call: 0=you can proceed, 1=need to wait for completion */
} pfm_smp_ipi_arg_t;

/*
 * perfmon command descriptions
 */
typedef struct {
	int		(*cmd_func)(struct task_struct *task, pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs);
	int		cmd_flags;
	unsigned int	cmd_narg;
	size_t		cmd_argsize;
} pfm_cmd_desc_t;

#define PFM_CMD_PID		0x1	/* command requires pid argument */
#define PFM_CMD_ARG_READ	0x2	/* command must read argument(s) */
#define PFM_CMD_ARG_WRITE	0x4	/* command must write argument(s) */
#define PFM_CMD_CTX		0x8	/* command needs a perfmon context */
#define PFM_CMD_NOCHK		0x10	/* command does not need to check task's state */

#define PFM_CMD_IDX(cmd)	(cmd)

#define PFM_CMD_IS_VALID(cmd)	((PFM_CMD_IDX(cmd) >= 0) && (PFM_CMD_IDX(cmd) < PFM_CMD_COUNT) \
				  && pfm_cmd_tab[PFM_CMD_IDX(cmd)].cmd_func != NULL)

#define PFM_CMD_USE_PID(cmd)	((pfm_cmd_tab[PFM_CMD_IDX(cmd)].cmd_flags & PFM_CMD_PID) != 0)
#define PFM_CMD_READ_ARG(cmd)	((pfm_cmd_tab[PFM_CMD_IDX(cmd)].cmd_flags & PFM_CMD_ARG_READ) != 0)
#define PFM_CMD_WRITE_ARG(cmd)	((pfm_cmd_tab[PFM_CMD_IDX(cmd)].cmd_flags & PFM_CMD_ARG_WRITE) != 0)
#define PFM_CMD_USE_CTX(cmd)	((pfm_cmd_tab[PFM_CMD_IDX(cmd)].cmd_flags & PFM_CMD_CTX) != 0)
#define PFM_CMD_CHK(cmd)	((pfm_cmd_tab[PFM_CMD_IDX(cmd)].cmd_flags & PFM_CMD_NOCHK) == 0)

#define PFM_CMD_ARG_MANY	-1	/* cannot be zero */

#define PFM_CMD_NARG(cmd)	(pfm_cmd_tab[PFM_CMD_IDX(cmd)].cmd_narg)
#define PFM_CMD_ARG_SIZE(cmd)	(pfm_cmd_tab[PFM_CMD_IDX(cmd)].cmd_argsize)

/*
 * perfmon internal variables
 */
static pmu_config_t	pmu_conf;	/* PMU configuration */
static int		pfm_debug_mode;	/* 0= nodebug, >0= debug output on */
static pfm_session_t	pfm_sessions;	/* global sessions information */
static struct proc_dir_entry *perfmon_dir; /* for debug only */
static unsigned long	pfm_spurious_ovfl_intr_count;	/* keep track of spurious ovfl interrupts */
static unsigned long	pfm_ovfl_intr_count;		/* keep track of ovfl interrupts */
static unsigned long	pfm_recorded_samples_count;

static unsigned long reset_pmcs[IA64_NUM_PMC_REGS];	/* contains PAL reset values for PMCS */

static void pfm_vm_close(struct vm_area_struct * area);

static struct vm_operations_struct pfm_vm_ops={
	close: pfm_vm_close
};
/*
 * keep track of the task owning the PMU per CPU.
 */
static struct {
	struct task_struct *owner;
} ____cacheline_aligned pmu_owners[NR_CPUS];

/*
 * forward declarations
 */
static void ia64_reset_pmu(struct task_struct *);
#ifdef CONFIG_SMP
static void pfm_fetch_regs(int cpu, struct task_struct *task, pfm_context_t *ctx);
#endif
static void pfm_lazy_save_regs (struct task_struct *ta);

static inline unsigned long
pfm_read_soft_counter(pfm_context_t *ctx, int i)
{
	return ctx->ctx_soft_pmds[i].val + (ia64_get_pmd(i) & pmu_conf.perf_ovfl_val);
}

static inline void
pfm_write_soft_counter(pfm_context_t *ctx, int i, unsigned long val)
{
	ctx->ctx_soft_pmds[i].val = val & ~pmu_conf.perf_ovfl_val;
	/*
	 * writing to the unimplemented part is ignored, so we do not need to
	 * mask off the top part
	 */
	ia64_set_pmd(i, val);
}

/*
 * finds the number of PM(C|D) registers given
 * the bitvector returned by PAL
 */
static unsigned long __init
find_num_pm_regs(long *buffer)
{
	int i = 3; /* 4 words per bitvector */

	/* start from the most significant word */
	while (i >= 0 && buffer[i] == 0) i--;
	if (i < 0) {
		printk(KERN_ERR "perfmon: No bit set in pm_buffer\n");
		return 0;
	}
	return 1 + ia64_fls(buffer[i]) + 64 * i;
}

/*
 * Generates a unique (per CPU) timestamp
 */
static inline unsigned long
pfm_get_stamp(void)
{
	/*
	 * XXX: must find something more efficient
	 */
	return ia64_get_itc();
}

/* Given PGD from the address space's page table, return the kernel
 * virtual mapping of the physical memory mapped at ADR.
 */
static inline unsigned long
uvirt_to_kva(pgd_t *pgd, unsigned long adr)
{
	unsigned long ret = 0UL;
	pmd_t *pmd;
	pte_t *ptep, pte;

	if (!pgd_none(*pgd)) {
		pmd = pmd_offset(pgd, adr);
		if (!pmd_none(*pmd)) {
			ptep = pte_offset(pmd, adr);
			pte = *ptep;
			if (pte_present(pte)) {
				ret = (unsigned long) page_address(pte_page(pte));
				ret |= (adr & (PAGE_SIZE - 1));
			}
		}
	}
	DBprintk(("[%d] uv2kva(%lx-->%lx)\n", current->pid, adr, ret));
	return ret;
}

/* Here we want the physical address of the memory.
 * This is used when initializing the contents of the
 * area and marking the pages as reserved.
 */
static inline unsigned long
pfm_kvirt_to_pa(unsigned long adr)
{
	__u64 pa = ia64_tpa(adr);
	//DBprintk(("kv2pa(%lx-->%lx)\n", adr, pa));
	return pa;
}

static void *
pfm_rvmalloc(unsigned long size)
{
	void *mem;
	unsigned long adr, page;

	mem = vmalloc(size);
	if (mem) {
		//printk("perfmon: CPU%d pfm_rvmalloc(%ld)=%p\n", smp_processor_id(), size, mem);
		memset(mem, 0, size); /* Clear the ram out, no junk to the user */
		adr = (unsigned long) mem;
		while (size > 0) {
			page = pfm_kvirt_to_pa(adr);
			mem_map_reserve(virt_to_page(__va(page)));
			adr  += PAGE_SIZE;
			size -= PAGE_SIZE;
		}
	}
	return mem;
}

static void
pfm_rvfree(void *mem, unsigned long size)
{
	unsigned long adr, page = 0;

	if (mem) {
		adr = (unsigned long) mem;
		while (size > 0) {
			page = pfm_kvirt_to_pa(adr);
			mem_map_unreserve(virt_to_page(__va(page)));
			adr  += PAGE_SIZE;
			size -= PAGE_SIZE;
		}
		vfree(mem);
	}
	return;
}
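/*
 * Note: the reserve/unreserve loops above walk the region in PAGE_SIZE
 * steps, so both functions assume a page-aligned size. This holds for
 * the caller, pfm_smpl_buffer_alloc(), which passes a PAGE_ALIGN()ed
 * size, and pfm_rvfree() must be called with that exact same size to
 * keep mem_map_reserve()/mem_map_unreserve() balanced.
 */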
/*
 * This function gets called from mm/mmap.c:exit_mmap() only when there is a sampling buffer
 * attached to the context AND the current task has a mapping for it, i.e., it is the original
 * creator of the context.
 *
 * This function is used to remember the fact that the vma describing the sampling buffer
 * has now been removed. It can only be called when no other tasks share the same mm context.
 */
static void
pfm_vm_close(struct vm_area_struct *vma)
{
	pfm_smpl_buffer_desc_t *psb = (pfm_smpl_buffer_desc_t *)vma->vm_private_data;

	if (psb == NULL) {
		printk("perfmon: psb is null in [%d]\n", current->pid);
		return;
	}
	/*
	 * Add PSB to list of buffers to free on release_thread() when no more users
	 *
	 * This call is safe because, once the count is zero, it cannot be modified anymore.
	 * The fact that there are no more users of the mm context does not mean that the
	 * sampling buffer is not being used anymore outside of this task. In fact, it can
	 * still be accessed from within the kernel by another task (such as the monitored
	 * task).
	 *
	 * Therefore, we only move the psb into the list of buffers to free when we know
	 * nobody else is using it.
	 * The linked list is independent of the perfmon context, because in the case of
	 * multi-threaded processes, the last thread may not have been involved with
	 * monitoring; however it will be the one removing the vma and it should therefore
	 * also remove the sampling buffer. This buffer cannot be removed until the vma
	 * is removed.
	 *
	 * This function cannot remove the buffer from here, because exit_mmap() must first
	 * complete. Given that there is no other vma related callback in the generic code,
	 * we have created our own with the linked list of sampling buffers to free, which
	 * is part of the thread structure. In release_thread() we check if the list is
	 * empty. If not we call into perfmon to free the buffer and psb. That is the only
	 * way to ensure a safe deallocation of the sampling buffer which works when
	 * the buffer is shared between distinct processes or with multi-threaded programs.
	 *
	 * We need to lock the psb because the refcnt test and flag manipulation must
	 * look like an atomic operation vis a vis pfm_context_exit()
	 */
	LOCK_PSB(psb);

	if (psb->psb_refcnt == 0) {
		psb->psb_next = current->thread.pfm_smpl_buf_list;
		current->thread.pfm_smpl_buf_list = psb;

		DBprintk(("psb for [%d] smpl @%p size %ld inserted into list\n",
			current->pid, psb->psb_hdr, psb->psb_size));
	}
	DBprintk(("psb vma flag cleared for [%d] smpl @%p size %ld inserted into list\n",
			current->pid, psb->psb_hdr, psb->psb_size));

	/*
	 * indicate to pfm_context_exit() that the vma has been removed.
	 */
	psb->psb_flags &= ~PFM_PSB_VMA;

	UNLOCK_PSB(psb);
}
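/*
 * To summarize the lifetime protocol described above: a psb is queued on
 * thread.pfm_smpl_buf_list (and later freed in release_thread()) only
 * once its refcnt has dropped to zero AND its vma is gone. pfm_vm_close()
 * clears PFM_PSB_VMA for the latter, and pfm_context_exit() handles the
 * refcnt side, both under the same psb_lock.
 */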
/*
 * This function is called from pfm_destroy_context() and also from pfm_inherit()
 * to explicitly remove the sampling buffer mapping from the user level address space.
 */
static int
pfm_remove_smpl_mapping(struct task_struct *task)
{
	pfm_context_t *ctx = task->thread.pfm_context;
	pfm_smpl_buffer_desc_t *psb;
	int r;

	/*
	 * some sanity checks first
	 */
	if (ctx == NULL || task->mm == NULL || ctx->ctx_smpl_vaddr == 0 || ctx->ctx_psb == NULL) {
		printk("perfmon: invalid context mm=%p\n", task->mm);
		return -1;
	}
	psb = ctx->ctx_psb;

	down_write(&task->mm->mmap_sem);

	r = do_munmap(task->mm, ctx->ctx_smpl_vaddr, psb->psb_size);

	up_write(&task->mm->mmap_sem);
	if (r != 0) {
		printk("perfmon: pid %d unable to unmap sampling buffer @0x%lx size=%ld\n",
				task->pid, ctx->ctx_smpl_vaddr, psb->psb_size);
	}
	DBprintk(("[%d] do_unmap(0x%lx, %ld)=%d\n",
		task->pid, ctx->ctx_smpl_vaddr, psb->psb_size, r));

	/*
	 * make sure we suppress all traces of this buffer
	 * (important for pfm_inherit)
	 */
	ctx->ctx_smpl_vaddr = 0;

	return 0;
}

static pfm_context_t *
pfm_context_alloc(void)
{
	pfm_context_t *ctx;

	/* allocate context descriptor */
	ctx = kmalloc(sizeof(pfm_context_t), GFP_KERNEL);
	if (ctx) memset(ctx, 0, sizeof(pfm_context_t));

	return ctx;
}

static void
pfm_context_free(pfm_context_t *ctx)
{
	if (ctx) kfree(ctx);
}

static int
pfm_remap_buffer(unsigned long buf, unsigned long addr, unsigned long size)
{
	unsigned long page;

	DBprintk(("CPU%d buf=0x%lx addr=0x%lx size=%ld\n",
		  smp_processor_id(), buf, addr, size));

	while (size > 0) {
		page = pfm_kvirt_to_pa(buf);

		if (remap_page_range(addr, page, PAGE_SIZE, PAGE_SHARED)) return -ENOMEM;

		addr += PAGE_SIZE;
		buf  += PAGE_SIZE;
		size -= PAGE_SIZE;
	}
	return 0;
}

/*
 * counts the number of PMDS to save per entry.
 * This code is generic enough to accommodate more than 64 PMDS when they become available.
 */
static unsigned long
pfm_smpl_entry_size(unsigned long *which, unsigned long size)
{
	unsigned long res = 0;
	int i;

	for (i = 0; i < size; i++, which++) res += hweight64(*which);

	DBprintk(("weight=%ld\n", res));

	return res;
}
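/*
 * Sizing example for the allocation below: with entries=1024 and two PMDs
 * recorded per sample (regcount=2), the buffer needs
 *
 *	sizeof(perfmon_smpl_hdr_t) + 1024 * (sizeof(perfmon_smpl_entry_t) + 2*8)
 *
 * bytes, rounded up to a page boundary by PAGE_ALIGN(). The entry count
 * and PMD selection come straight from the user's pfarg_context_t request.
 */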
/*
 * Allocates the sampling buffer and remaps it into the caller's address space
 */
static int
pfm_smpl_buffer_alloc(pfm_context_t *ctx, unsigned long *which_pmds, unsigned long entries,
		      void **user_vaddr)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = NULL;
	unsigned long size, regcount;
	void *smpl_buf;
	pfm_smpl_buffer_desc_t *psb;

	regcount = pfm_smpl_entry_size(which_pmds, 1);

	/* note that regcount might be 0, in which case only the header for each
	 * entry will be recorded.
	 */

	/*
	 * 1 buffer hdr and for each entry a header + regcount PMDs to save
	 */
	size = PAGE_ALIGN(  sizeof(perfmon_smpl_hdr_t)
			  + entries * (sizeof(perfmon_smpl_entry_t) + regcount*sizeof(u64)));

	/*
	 * check requested size to avoid Denial-of-service attacks
	 * XXX: may have to refine this test
	 * Check against address space limit.
	 *
	 * if ((mm->total_vm << PAGE_SHIFT) + len > current->rlim[RLIMIT_AS].rlim_cur)
	 *	return -ENOMEM;
	 */
	if (size > current->rlim[RLIMIT_MEMLOCK].rlim_cur) return -EAGAIN;

	/*
	 * We do the easy to undo allocations first.
	 *
	 * pfm_rvmalloc() clears the buffer, so there is no leak
	 */
	smpl_buf = pfm_rvmalloc(size);
	if (smpl_buf == NULL) {
		DBprintk(("Can't allocate sampling buffer\n"));
		return -ENOMEM;
	}

	DBprintk(("smpl_buf @%p\n", smpl_buf));

	/* allocate sampling buffer descriptor now */
	psb = kmalloc(sizeof(*psb), GFP_KERNEL);
	if (psb == NULL) {
		DBprintk(("Can't allocate sampling buffer descriptor\n"));
		pfm_rvfree(smpl_buf, size);
		return -ENOMEM;
	}

	/* allocate vma */
	vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
	if (!vma) {
		DBprintk(("Cannot allocate vma\n"));
		goto error;
	}
	/*
	 * partially initialize the vma for the sampling buffer
	 */
	vma->vm_mm	     = mm;
	vma->vm_flags	     = VM_READ|VM_MAYREAD|VM_RESERVED;
	vma->vm_page_prot    = PAGE_READONLY; /* XXX may need to change */
	vma->vm_ops	     = &pfm_vm_ops; /* necessary to get the close() callback */
	vma->vm_pgoff	     = 0;
	vma->vm_file	     = NULL;
	vma->vm_raend	     = 0;
	vma->vm_private_data = psb;	/* information needed by the pfm_vm_close() function */

	/*
	 * Now we have everything we need and we can initialize
	 * and connect all the data structures
	 */
	psb->psb_hdr	 = smpl_buf;
	psb->psb_addr	 = ((char *)smpl_buf)+sizeof(perfmon_smpl_hdr_t); /* first entry */
	psb->psb_size	 = size; /* aligned size */
	psb->psb_index	 = 0;
	psb->psb_entries = entries;
	psb->psb_flags	 = PFM_PSB_VMA; /* remember that there is a vma describing the buffer */
	psb->psb_refcnt	 = 1;

	spin_lock_init(&psb->psb_lock);

	/*
	 * XXX: will need to do cacheline alignment to avoid false sharing in SMP mode and
	 * multitask monitoring.
	 */
	psb->psb_entry_size = sizeof(perfmon_smpl_entry_t) + regcount*sizeof(u64);

	DBprintk(("psb @%p entry_size=%ld hdr=%p addr=%p\n",
		  (void *)psb, psb->psb_entry_size, (void *)psb->psb_hdr,
		  (void *)psb->psb_addr));

	/* initialize some of the fields of user visible buffer header */
	psb->psb_hdr->hdr_version    = PFM_SMPL_VERSION;
	psb->psb_hdr->hdr_entry_size = psb->psb_entry_size;
	psb->psb_hdr->hdr_pmds[0]    = which_pmds[0];
	/*
	 * Let's do the difficult operations next.
	 *
	 * now we atomically find some area in the address space and
	 * remap the buffer into it.
	 */
	down_write(&current->mm->mmap_sem);

	/* find some free area in address space, must have mmap sem held */
	vma->vm_start = get_unmapped_area(NULL, 0, size, 0, MAP_PRIVATE|MAP_ANONYMOUS);
	if (vma->vm_start == 0UL) {
		DBprintk(("Cannot find unmapped area for size %ld\n", size));
		up_write(&current->mm->mmap_sem);
		goto error;
	}
	vma->vm_end = vma->vm_start + size;

	DBprintk(("entries=%ld aligned size=%ld, unmapped @0x%lx\n", entries, size, vma->vm_start));

	/* can only be applied to current, need to have the mm semaphore held when called */
	if (pfm_remap_buffer((unsigned long)smpl_buf, vma->vm_start, size)) {
		DBprintk(("Can't remap buffer\n"));
		up_write(&current->mm->mmap_sem);
		goto error;
	}

	/*
	 * now insert the vma in the vm list for the process, must be
	 * done with mmap lock held
	 */
	insert_vm_struct(mm, vma);

	mm->total_vm += size >> PAGE_SHIFT;

	up_write(&current->mm->mmap_sem);

	/* store which PMDS to record */
	ctx->ctx_smpl_regs[0] = which_pmds[0];

	/* link to perfmon context */
	ctx->ctx_psb = psb;

	/*
	 * keep track of user level virtual address
	 */
	ctx->ctx_smpl_vaddr = *(unsigned long *)user_vaddr = vma->vm_start;

	return 0;

error:
	pfm_rvfree(smpl_buf, size);
	kfree(psb);
	return -ENOMEM;
}

/*
 * XXX: do something better here
 */
static int
pfm_bad_permissions(struct task_struct *task)
{
	/* stolen from bad_signal() */
	return (current->session != task->session)
	    && (current->euid ^ task->suid) && (current->euid ^ task->uid)
	    && (current->uid ^ task->suid) && (current->uid ^ task->uid);
}

static int
pfx_is_sane(struct task_struct *task, pfarg_context_t *pfx)
{
	int ctx_flags;
	int cpu;

	/* valid signal */

	/* cannot send to process 1, 0 means do not notify */
	if (pfx->ctx_notify_pid == 1) {
		DBprintk(("invalid notify_pid %d\n", pfx->ctx_notify_pid));
		return -EINVAL;
	}
	ctx_flags = pfx->ctx_flags;

	if (ctx_flags & PFM_FL_SYSTEM_WIDE) {
		DBprintk(("cpu_mask=0x%lx\n", pfx->ctx_cpu_mask));
		/*
		 * cannot block in this mode
		 */
		if (ctx_flags & PFM_FL_NOTIFY_BLOCK) {
			DBprintk(("cannot use blocking mode when in system wide monitoring\n"));
			return -EINVAL;
		}
		/*
		 * must only have one bit set in the CPU mask
		 */
		if (hweight64(pfx->ctx_cpu_mask) != 1UL) {
			DBprintk(("invalid CPU mask specified\n"));
			return -EINVAL;
		}
		/*
		 * and it must be a valid CPU
		 */
		cpu = ffs(pfx->ctx_cpu_mask);
		if (cpu > smp_num_cpus) {
			DBprintk(("CPU%d is not online\n", cpu));
			return -EINVAL;
		}
		/*
		 * check for pre-existing pinning, if conflicting reject
		 */
		if (task->cpus_allowed != ~0UL && (task->cpus_allowed & (1UL<<cpu)) == 0) {
			DBprintk(("[%d] pinned on 0x%lx, conflicts with CPU%d\n", task->pid,
				  task->cpus_allowed, cpu));
			return -EINVAL;
		}
	} else {
		/*
		 * must provide a target for the signal in blocking mode even when
		 * no counter is configured with PFM_FL_REG_OVFL_NOTIFY
		 */
		if ((ctx_flags & PFM_FL_NOTIFY_BLOCK) && pfx->ctx_notify_pid == 0) return -EINVAL;
	}
	/* probably more to add here */

	return 0;
}
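/*
 * Note on the CPU mask handling above: ffs() returns a 1-based bit index
 * (and 0 for an empty mask), which is why the validity test compares
 * against smp_num_cpus without subtracting one, while pfm_create_context()
 * below uses ffs(ctx_cpu_mask) - 1 to recover the actual CPU number.
 */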
DBprintk(("system wide not possible, task_sessions=%ld\n", pfm_sessions.pfs_task_sessions)); goto abort; } if (pfm_sessions.pfs_sys_session[cpu]) { DBprintk(("system wide not possible, conflicting session [%d] on CPU%d\n",pfm_sessions.pfs_sys_session[cpu]->pid, cpu)); goto abort; } pfm_sessions.pfs_sys_session[cpu] = task; /* * count the number of system wide sessions */ pfm_sessions.pfs_sys_sessions++; } else if (pfm_sessions.pfs_sys_sessions == 0) { pfm_sessions.pfs_task_sessions++; } else { /* no per-process monitoring while there is a system wide session */ goto abort; } UNLOCK_PFS(); ret = -ENOMEM; ctx = pfm_context_alloc(); if (!ctx) goto error; /* record the creator (important for inheritance) */ ctx->ctx_owner = current; notify_pid = tmp.ctx_notify_pid; spin_lock_init(&ctx->ctx_lock); if (notify_pid == current->pid) { ctx->ctx_notify_task = task = current; current->thread.pfm_context = ctx; } else if (notify_pid!=0) { struct task_struct *notify_task; read_lock(&tasklist_lock); notify_task = find_task_by_pid(notify_pid); if (notify_task) { ret = -EPERM; /* * check if we can send this task a signal */ if (pfm_bad_permissions(notify_task)) goto buffer_error; /* * make visible * must be done inside critical section * * if the initialization does not go through it is still * okay because child will do the scan for nothing which * won't hurt. */ current->thread.pfm_context = ctx; /* * will cause task to check on exit for monitored * processes that would notify it. see release_thread() * Note: the scan MUST be done in release thread, once the * task has been detached from the tasklist otherwise you are * exposed to race conditions. */ atomic_add(1, &ctx->ctx_notify_task->thread.pfm_notifiers_check); ctx->ctx_notify_task = notify_task; } read_unlock(&tasklist_lock); } /* * notification process does not exist */ if (notify_pid != 0 && ctx->ctx_notify_task == NULL) { ret = -EINVAL; goto buffer_error; } if (tmp.ctx_smpl_entries) { DBprintk(("sampling entries=%ld\n",tmp.ctx_smpl_entries)); ret = pfm_smpl_buffer_alloc(ctx, tmp.ctx_smpl_regs, tmp.ctx_smpl_entries, &uaddr); if (ret<0) goto buffer_error; tmp.ctx_smpl_vaddr = uaddr; } /* initialization of context's flags */ ctx->ctx_fl_inherit = ctx_flags & PFM_FL_INHERIT_MASK; ctx->ctx_fl_block = (ctx_flags & PFM_FL_NOTIFY_BLOCK) ? 1 : 0; ctx->ctx_fl_system = (ctx_flags & PFM_FL_SYSTEM_WIDE) ? 1: 0; ctx->ctx_fl_frozen = 0; /* * setting this flag to 0 here means, that the creator or the task that the * context is being attached are granted access. Given that a context can only * be created for the calling process this, in effect only allows the creator * to access the context. See pfm_protect() for more. */ ctx->ctx_fl_protected = 0; /* for system wide mode only (only 1 bit set) */ ctx->ctx_cpu = cpu; atomic_set(&ctx->ctx_last_cpu,-1); /* SMP only, means no CPU */ /* * Keep track of the pmds we want to sample * XXX: may be we don't need to save/restore the DEAR/IEAR pmds * but we do need the BTB for sure. This is because of a hardware * buffer of 1 only for non-BTB pmds. 
	/*
	 * Keep track of the pmds we want to sample
	 * XXX: maybe we don't need to save/restore the DEAR/IEAR pmds
	 * but we do need the BTB for sure. This is because of a hardware
	 * buffer of 1 only for non-BTB pmds.
	 *
	 * We ignore the unimplemented pmds specified by the user
	 */
	ctx->ctx_used_pmds[0]  = tmp.ctx_smpl_regs[0] & pmu_conf.impl_regs[4];
	ctx->ctx_saved_pmcs[0] = 1; /* always save/restore PMC[0] */

	sema_init(&ctx->ctx_restart_sem, 0); /* init this semaphore to locked */

	if (copy_to_user(req, &tmp, sizeof(tmp))) {
		ret = -EFAULT;
		goto buffer_error;
	}

	DBprintk(("context=%p, pid=%d notify_task=%p\n",
			(void *)ctx, task->pid, ctx->ctx_notify_task));

	DBprintk(("context=%p, pid=%d flags=0x%x inherit=%d block=%d system=%d\n",
			(void *)ctx, task->pid, ctx_flags, ctx->ctx_fl_inherit,
			ctx->ctx_fl_block, ctx->ctx_fl_system));

	/*
	 * when no notification is required, we can make this visible at the last moment
	 */
	if (notify_pid == 0) task->thread.pfm_context = ctx;

	/*
	 * pin task to CPU and force reschedule on exit to ensure
	 * that when back at the user level the task runs on the designated
	 * CPU.
	 */
	if (ctx->ctx_fl_system) {
		ctx->ctx_saved_cpus_allowed = task->cpus_allowed;
		task->cpus_allowed = 1UL << cpu;
		task->need_resched = 1;
		DBprintk(("[%d] rescheduled allowed=0x%lx\n", task->pid, task->cpus_allowed));
	}

	return 0;

buffer_error:
	pfm_context_free(ctx);
error:
	/*
	 * undo session reservation
	 */
	LOCK_PFS();

	if (ctx_flags & PFM_FL_SYSTEM_WIDE) {
		pfm_sessions.pfs_sys_session[cpu] = NULL;
		pfm_sessions.pfs_sys_sessions--;
	} else {
		pfm_sessions.pfs_task_sessions--;
	}
abort:
	UNLOCK_PFS();

	return ret;
}

static void
pfm_reset_regs(pfm_context_t *ctx, unsigned long *ovfl_regs, int flag)
{
	unsigned long mask = ovfl_regs[0];
	unsigned long reset_others = 0UL;
	unsigned long val;
	int i;

	DBprintk(("masks=0x%lx\n", mask));

	/*
	 * now restore reset value on sampling overflowed counters
	 */
	mask >>= PMU_FIRST_COUNTER;
	for(i = PMU_FIRST_COUNTER; mask; i++, mask >>= 1) {
		if (mask & 0x1) {
			val = flag == PFM_RELOAD_LONG_RESET ?
					ctx->ctx_soft_pmds[i].long_reset :
					ctx->ctx_soft_pmds[i].short_reset;

			reset_others |= ctx->ctx_soft_pmds[i].reset_pmds[0];

			DBprintk(("[%d] %s reset soft_pmd[%d]=%lx\n", current->pid,
				  flag == PFM_RELOAD_LONG_RESET ? "long" : "short", i, val));

			/* upper part is ignored on rval */
			pfm_write_soft_counter(ctx, i, val);
		}
	}

	/*
	 * Now take care of resetting the other registers
	 */
	for(i = 0; reset_others; i++, reset_others >>= 1) {

		if ((reset_others & 0x1) == 0) continue;

		val = flag == PFM_RELOAD_LONG_RESET ?
				ctx->ctx_soft_pmds[i].long_reset :
				ctx->ctx_soft_pmds[i].short_reset;

		if (PMD_IS_COUNTING(i)) {
			pfm_write_soft_counter(ctx, i, val);
		} else {
			ia64_set_pmd(i, val);
		}
		DBprintk(("[%d] %s reset_others pmd[%d]=%lx\n", current->pid,
			  flag == PFM_RELOAD_LONG_RESET ? "long" : "short", i, val));
	}

	/* just in case ! */
	ctx->ctx_ovfl_regs[0] = 0UL;
}
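/*
 * On the two reset flavors used above: as far as this file shows,
 * PFM_RELOAD_LONG_RESET is applied on an explicit user restart (see
 * pfm_restart()), while PFM_RELOAD_SHORT_RESET is presumably used by the
 * in-kernel reload path after a sampling overflow, giving the counter a
 * shorter period until its next overflow.
 */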
static int
pfm_write_pmcs(struct task_struct *ta, pfm_context_t *ctx, void *arg, int count,
	       struct pt_regs *regs)
{
	struct thread_struct *th = &ta->thread;
	pfarg_reg_t tmp, *req = (pfarg_reg_t *)arg;
	unsigned int cnum;
	int i;
	int ret = 0, reg_retval = 0;

	/* we don't quite support this right now */
	if (ta != current) return -EINVAL;

	if (!CTX_IS_ENABLED(ctx)) return -EINVAL;

	/* XXX: ctx locking may be required here */

	for (i = 0; i < count; i++, req++) {

		if (copy_from_user(&tmp, req, sizeof(tmp))) return -EFAULT;

		cnum = tmp.reg_num;

		/*
		 * we reject all non implemented PMC as well
		 * as attempts to modify PMC[0-3] which are used
		 * as status registers by the PMU
		 */
		if (!PMC_IS_IMPL(cnum) || cnum < 4) {
			DBprintk(("pmc[%u] is unimplemented or invalid\n", cnum));
			ret = -EINVAL;
			goto abort_mission;
		}
		/*
		 * A PMC used to configure monitors must be:
		 *	- system-wide session: privileged monitor
		 *	- per-task : user monitor
		 * any other configuration is rejected.
		 */
		if (PMC_IS_MONITOR(cnum)) {
			pfm_monitor_t *p = (pfm_monitor_t *)&tmp.reg_value;

			DBprintk(("pmc[%u].pm = %d\n", cnum, p->pmc_pm));

			if (ctx->ctx_fl_system ^ p->pmc_pm) {
				//if ((ctx->ctx_fl_system == 1 && p->pmc_pm == 0)
				//  ||(ctx->ctx_fl_system == 0 && p->pmc_pm == 1)) {
				ret = -EINVAL;
				goto abort_mission;
			}
			/*
			 * enforce generation of overflow interrupt. Necessary on all
			 * CPUs which do not implement 64-bit hardware counters.
			 */
			p->pmc_oi = 1;
		}

		if (PMC_IS_COUNTING(cnum)) {
			if (tmp.reg_flags & PFM_REGFL_OVFL_NOTIFY) {
				/*
				 * must have a target for the signal
				 */
				if (ctx->ctx_notify_task == NULL) {
					ret = -EINVAL;
					goto abort_mission;
				}

				ctx->ctx_soft_pmds[cnum].flags |= PFM_REGFL_OVFL_NOTIFY;
			}
			/*
			 * copy reset vector
			 */
			ctx->ctx_soft_pmds[cnum].reset_pmds[0] = tmp.reg_reset_pmds[0];
			ctx->ctx_soft_pmds[cnum].reset_pmds[1] = tmp.reg_reset_pmds[1];
			ctx->ctx_soft_pmds[cnum].reset_pmds[2] = tmp.reg_reset_pmds[2];
			ctx->ctx_soft_pmds[cnum].reset_pmds[3] = tmp.reg_reset_pmds[3];

			/*
			 * needed in case the user does not initialize the equivalent
			 * PMD. Clearing is done in reset_pmu() so there is no possible
			 * leak here.
			 */
			CTX_USED_PMD(ctx, cnum);
		}
abort_mission:
		if (ret == -EINVAL) reg_retval = PFM_REG_RETFL_EINVAL;

		PFM_REG_RETFLAG_SET(tmp.reg_flags, reg_retval);

		/*
		 * update register return value, abort all if problem during copy.
		 */
		if (copy_to_user(req, &tmp, sizeof(tmp))) return -EFAULT;

		/*
		 * if there was something wrong on this register, don't touch
		 * the hardware at all and abort write request for others.
		 *
		 * On error, the user must sequentially scan the table and the first
		 * entry which has a return flag set is the one that caused the error.
		 */
		if (ret != 0) {
			DBprintk(("[%d] pmc[%u]=0x%lx error %d\n",
				  ta->pid, cnum, tmp.reg_value, reg_retval));
			break;
		}

		/*
		 * We can proceed with this register!
		 */

		/*
		 * keep a copy of the pmc, used for register reload
		 */
		th->pmc[cnum] = tmp.reg_value;

		ia64_set_pmc(cnum, tmp.reg_value);

		DBprintk(("[%d] pmc[%u]=0x%lx flags=0x%x save_pmcs=0x%lx reload_pmcs=0x%lx\n",
			  ta->pid, cnum, tmp.reg_value, ctx->ctx_soft_pmds[cnum].flags,
			  ctx->ctx_saved_pmcs[0], ctx->ctx_reload_pmcs[0]));
	}
	return ret;
}
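/*
 * For illustration (hypothetical user-level snippet, not part of this
 * file): programming one counting monitor through the routine above would
 * look roughly like
 *
 *	pfarg_reg_t pc;
 *	memset(&pc, 0, sizeof(pc));
 *	pc.reg_num   = 4;                      // first generic counter (PMU_FIRST_COUNTER)
 *	pc.reg_value = ...;                    // plm/es/pm bits, laid out as pfm_monitor_t
 *	pc.reg_flags = PFM_REGFL_OVFL_NOTIFY;  // request overflow notification
 *	perfmonctl(getpid(), PFM_WRITE_PMCS, &pc, 1);
 *
 * On error, the first pfarg_reg_t whose return flag is set identifies the
 * register that caused the failure, per the comment in the loop above.
 */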
static int
pfm_write_pmds(struct task_struct *ta, pfm_context_t *ctx, void *arg, int count,
	       struct pt_regs *regs)
{
	pfarg_reg_t tmp, *req = (pfarg_reg_t *)arg;
	unsigned int cnum;
	int i;
	int ret = 0, reg_retval = 0;

	/* we don't quite support this right now */
	if (ta != current) return -EINVAL;

	/*
	 * Cannot do anything before the PMU is enabled
	 */
	if (!CTX_IS_ENABLED(ctx)) return -EINVAL;

	/* XXX: ctx locking may be required here */

	for (i = 0; i < count; i++, req++) {
		if (copy_from_user(&tmp, req, sizeof(tmp))) return -EFAULT;

		cnum = tmp.reg_num;

		if (!PMD_IS_IMPL(cnum)) {
			ret = -EINVAL;
			goto abort_mission;
		}

		/* update virtualized (64bits) counter */
		if (PMD_IS_COUNTING(cnum)) {
			ctx->ctx_soft_pmds[cnum].ival = tmp.reg_value;
			ctx->ctx_soft_pmds[cnum].val  = tmp.reg_value & ~pmu_conf.perf_ovfl_val;
			ctx->ctx_soft_pmds[cnum].long_reset  = tmp.reg_long_reset;
			ctx->ctx_soft_pmds[cnum].short_reset = tmp.reg_short_reset;
		}
abort_mission:
		if (ret == -EINVAL) reg_retval = PFM_REG_RETFL_EINVAL;

		PFM_REG_RETFLAG_SET(tmp.reg_flags, reg_retval);

		if (copy_to_user(req, &tmp, sizeof(tmp))) return -EFAULT;

		/*
		 * if there was something wrong on this register, don't touch
		 * the hardware at all and abort write request for others.
		 *
		 * On error, the user must sequentially scan the table and the first
		 * entry which has a return flag set is the one that caused the error.
		 */
		if (ret != 0) {
			DBprintk(("[%d] pmd[%u]=0x%lx error %d\n",
				  ta->pid, cnum, tmp.reg_value, reg_retval));
			break;
		}

		/* keep track of what we use */
		CTX_USED_PMD(ctx, cnum);

		/* writes to the unimplemented part are ignored, so this is safe */
		ia64_set_pmd(cnum, tmp.reg_value);

		/* to go away */
		ia64_srlz_d();

		DBprintk(("[%d] pmd[%u]: soft_pmd=0x%lx short_reset=0x%lx "
			  "long_reset=0x%lx hw_pmd=%lx notify=%c used_pmds=0x%lx reset_pmds=0x%lx\n",
			  ta->pid, cnum,
			  ctx->ctx_soft_pmds[cnum].val,
			  ctx->ctx_soft_pmds[cnum].short_reset,
			  ctx->ctx_soft_pmds[cnum].long_reset,
			  ia64_get_pmd(cnum) & pmu_conf.perf_ovfl_val,
			  PMC_OVFL_NOTIFY(ctx, cnum) ? 'Y':'N',
			  ctx->ctx_used_pmds[0],
			  ctx->ctx_soft_pmds[cnum].reset_pmds[0]));
	}
	return ret;
}
static int
pfm_read_pmds(struct task_struct *ta, pfm_context_t *ctx, void *arg, int count,
	      struct pt_regs *regs)
{
	struct thread_struct *th = &ta->thread;
	unsigned long val = 0;
	pfarg_reg_t tmp, *req = (pfarg_reg_t *)arg;
	int i;

	if (!CTX_IS_ENABLED(ctx)) return -EINVAL;

	/*
	 * XXX: MUST MAKE SURE WE DON'T HAVE ANY PENDING OVERFLOW BEFORE READING.
	 * This is required when the monitoring has been stopped by user or kernel.
	 * If it is still going on, then that's fine because we are not guaranteed
	 * to return an accurate value in this case.
	 */

	/* XXX: ctx locking may be required here */

	DBprintk(("ctx_last_cpu=%d for [%d]\n", atomic_read(&ctx->ctx_last_cpu), ta->pid));

	for (i = 0; i < count; i++, req++) {
		unsigned long reg_val = ~0UL, ctx_val = ~0UL;

		if (copy_from_user(&tmp, req, sizeof(tmp))) return -EFAULT;

		if (!PMD_IS_IMPL(tmp.reg_num)) goto abort_mission;

		/*
		 * If the task is not the current one, then we check if the
		 * PMU state is still in the local live registers due to lazy
		 * ctxsw. If true, then we read directly from the registers.
		 */
		if (atomic_read(&ctx->ctx_last_cpu) == smp_processor_id()) {
			ia64_srlz_d();
			val = reg_val = ia64_get_pmd(tmp.reg_num);
			DBprintk(("reading pmd[%u]=0x%lx from hw\n", tmp.reg_num, val));
		} else {
#ifdef CONFIG_SMP
			int cpu;
			/*
			 * for SMP systems, the context may still be live on another
			 * CPU so we need to fetch it before proceeding with the read.
			 * This call will only be made once for the whole loop because
			 * ctx_last_cpu then becomes == -1.
			 *
			 * We cannot reuse ctx_last_cpu as it may change before we get to the
			 * actual IPI call. In this case, we will do the call for nothing but
			 * there is no way around it. The receiving side will simply do nothing.
			 */
			cpu = atomic_read(&ctx->ctx_last_cpu);
			if (cpu != -1) {
				DBprintk(("must fetch on CPU%d for [%d]\n", cpu, ta->pid));
				pfm_fetch_regs(cpu, ta, ctx);
			}
#endif
			/* context has been saved */
			val = reg_val = th->pmd[tmp.reg_num];
		}
		if (PMD_IS_COUNTING(tmp.reg_num)) {
			/*
			 * XXX: need to check for overflow
			 */

			val &= pmu_conf.perf_ovfl_val;
			val += ctx_val = ctx->ctx_soft_pmds[tmp.reg_num].val;
		} else {
			val = reg_val = ia64_get_pmd(tmp.reg_num);
		}

		PFM_REG_RETFLAG_SET(tmp.reg_flags, 0);

		tmp.reg_value = val;

		DBprintk(("read pmd[%u] soft_pmd=0x%lx reg=0x%lx pmc=0x%lx\n",
			  tmp.reg_num, ctx_val, reg_val,
			  ia64_get_pmc(tmp.reg_num)));

		if (copy_to_user(req, &tmp, sizeof(tmp))) return -EFAULT;
	}
	return 0;

abort_mission:
	PFM_REG_RETFLAG_SET(tmp.reg_flags, PFM_REG_RETFL_EINVAL);
	/*
	 * XXX: if this fails, we stick with the original failure, flag not updated!
	 */
	copy_to_user(req, &tmp, sizeof(tmp));

	return -EINVAL;
}
#ifdef PFM_PMU_USES_DBR
/*
 * Only call this function when a process is trying to
 * write the debug registers (reading is always allowed)
 */
int
pfm_use_debug_registers(struct task_struct *task)
{
	pfm_context_t *ctx = task->thread.pfm_context;
	int ret = 0;

	DBprintk(("called for [%d]\n", task->pid));

	/*
	 * do it only once
	 */
	if (task->thread.flags & IA64_THREAD_DBG_VALID) return 0;

	/*
	 * Even on SMP, we do not need to use an atomic here because
	 * the only way in is via ptrace() and this is possible only when the
	 * process is stopped. Even in the case where the ctxsw out is not totally
	 * completed by the time we come here, there is no way the 'stopped' process
	 * could be in the middle of fiddling with the pfm_write_ibr_dbr() routine.
	 * So this is always safe.
	 */
	if (ctx && ctx->ctx_fl_using_dbreg == 1) return -1;

	/*
	 * XXX: not pretty
	 */
	LOCK_PFS();

	/*
	 * We only allow the use of debug registers when there is no system
	 * wide monitoring
	 * XXX: we could relax this by
	 */
	if (pfm_sessions.pfs_sys_use_dbregs > 0)
		ret = -1;
	else
		pfm_sessions.pfs_ptrace_use_dbregs++;

	DBprintk(("ptrace_use_dbregs=%lu sys_use_dbregs=%lu by [%d] ret = %d\n",
		  pfm_sessions.pfs_ptrace_use_dbregs,
		  pfm_sessions.pfs_sys_use_dbregs,
		  task->pid, ret));

	UNLOCK_PFS();

	return ret;
}

/*
 * This function is called for every task that exits with the
 * IA64_THREAD_DBG_VALID set. This indicates a task which was
 * able to use the debug registers for debugging purposes via
 * ptrace(). Therefore we know it was not using them for
 * performance monitoring, so we only decrement the number
 * of "ptraced" debug register users to keep the count up to date.
 */
int
pfm_release_debug_registers(struct task_struct *task)
{
	int ret;

	LOCK_PFS();
	if (pfm_sessions.pfs_ptrace_use_dbregs == 0) {
		printk("perfmon: invalid release for [%d] ptrace_use_dbregs=0\n", task->pid);
		ret = -1;
	} else {
		pfm_sessions.pfs_ptrace_use_dbregs--;
		ret = 0;
	}
	UNLOCK_PFS();

	return ret;
}

#else /* !PFM_PMU_USES_DBR */
/*
 * in case the PMU does not use the debug registers, these two functions are nops.
 * The first function is called from arch/ia64/kernel/ptrace.c.
 * The second function is called from arch/ia64/kernel/process.c.
 */
int
pfm_use_debug_registers(struct task_struct *task)
{
	return 0;
}

int
pfm_release_debug_registers(struct task_struct *task)
{
	return 0;
}
#endif /* PFM_PMU_USES_DBR */

static int
pfm_restart(struct task_struct *task, pfm_context_t *ctx, void *arg, int count,
	    struct pt_regs *regs)
{
	void *sem = &ctx->ctx_restart_sem;

	/*
	 * Cannot do anything before the PMU is enabled
	 */
	if (!CTX_IS_ENABLED(ctx)) return -EINVAL;

	if (ctx->ctx_fl_frozen == 0) {
		printk("task %d without pmu_frozen set\n", task->pid);
		return -EINVAL;
	}

	if (task == current) {
		DBprintk(("restarting self %d frozen=%d \n", current->pid, ctx->ctx_fl_frozen));

		pfm_reset_regs(ctx, ctx->ctx_ovfl_regs, PFM_RELOAD_LONG_RESET);

		ctx->ctx_ovfl_regs[0] = 0UL;

		/*
		 * We ignore block/don't block because we never block
		 * for a self-monitoring process.
		 */
		ctx->ctx_fl_frozen = 0;

		if (CTX_HAS_SMPL(ctx)) {
			ctx->ctx_psb->psb_hdr->hdr_count = 0;
			ctx->ctx_psb->psb_index = 0;
		}

		/* simply unfreeze */
		ia64_set_pmc(0, 0);
		ia64_srlz_d();

		return 0;
	}
	/* restart on another task */

	/*
	 * if blocking, then post the semaphore.
	 * if non-blocking, then we ensure that the task will go into
	 * pfm_overflow_must_block() before returning to user mode.
	 * We cannot explicitly reset another task, it MUST always
	 * be done by the task itself. This works for system wide because
	 * the tool that is controlling the session is doing "self-monitoring".
	 *
	 * XXX: what if the task never goes back to user?
	 */
	if (CTX_OVFL_NOBLOCK(ctx) == 0) {
		DBprintk(("unblocking %d \n", task->pid));
		up(sem);
	} else {
		task->thread.pfm_ovfl_block_reset = 1;
	}
#if 0
	/*
	 * in case of non blocking mode, it's just a matter of
	 * resetting the sampling buffer (if any) index. The PMU
	 * is already active.
	 */

	/*
	 * must reset the header count first
	 */
	if (CTX_HAS_SMPL(ctx)) {
		DBprintk(("resetting sampling indexes for %d \n", task->pid));
		ctx->ctx_psb->psb_hdr->hdr_count = 0;
		ctx->ctx_psb->psb_index = 0;
	}
#endif
	return 0;
}
#ifndef CONFIG_SMP
/*
 * On UP kernels, we do not need to constantly set the psr.pp bit
 * when a task is scheduled. The psr.pp bit can only be changed in
 * the kernel because of a user request. Given we are on a UP non-preemptive
 * kernel we know that no other task is running, so we can simply update their
 * psr.pp from their saved state. There is thus no impact on the context switch
 * code compared to the SMP case.
 */
static void
pfm_tasklist_toggle_pp(unsigned int val)
{
	struct task_struct *p;
	struct pt_regs *regs;

	DBprintk(("invoked by [%d] pp=%u\n", current->pid, val));

	read_lock(&tasklist_lock);

	for_each_task(p) {
		regs = (struct pt_regs *)((unsigned long)p + IA64_STK_OFFSET);

		/*
		 * position on pt_regs saved on stack on 1st entry into the kernel
		 */
		regs--;

		/*
		 * update psr.pp
		 */
		ia64_psr(regs)->pp = val;
	}
	read_unlock(&tasklist_lock);
}
#endif

static int
pfm_stop(struct task_struct *task, pfm_context_t *ctx, void *arg, int count,
	 struct pt_regs *regs)
{
	/* we don't quite support this right now */
	if (task != current) return -EINVAL;

	/*
	 * Cannot do anything before the PMU is enabled
	 */
	if (!CTX_IS_ENABLED(ctx)) return -EINVAL;

	DBprintk(("[%d] fl_system=%d owner=%p current=%p\n",
		  current->pid, ctx->ctx_fl_system, PMU_OWNER(), current));

	/* simply stop monitoring but not the PMU */
	if (ctx->ctx_fl_system) {

		__asm__ __volatile__ ("rsm psr.pp;;"::: "memory");

		/* disable dcr pp */
		ia64_set_dcr(ia64_get_dcr() & ~IA64_DCR_PP);

#ifdef CONFIG_SMP
		local_cpu_data->pfm_dcr_pp = 0;
#else
		pfm_tasklist_toggle_pp(0);
#endif
		ia64_psr(regs)->pp = 0;

	} else {
		__asm__ __volatile__ ("rum psr.up;;"::: "memory");

		ia64_psr(regs)->up = 0;
	}
	return 0;
}

static int
pfm_disable(struct task_struct *task, pfm_context_t *ctx, void *arg, int count,
	    struct pt_regs *regs)
{
	/* we don't quite support this right now */
	if (task != current) return -EINVAL;

	if (!CTX_IS_ENABLED(ctx)) return -EINVAL;

	/*
	 * stop monitoring, freeze PMU, and save state in context
	 * this call will clear IA64_THREAD_PM_VALID for per-task sessions.
	 */
	pfm_flush_regs(task);

	if (ctx->ctx_fl_system) {
		ia64_psr(regs)->pp = 0;
	} else {
		ia64_psr(regs)->up = 0;
	}
	/*
	 * goes back to default behavior
	 * no need to change live psr.sp because it is useless at the kernel level
	 */
	ia64_psr(regs)->sp = 1;

	DBprintk(("enabling psr.sp for [%d]\n", current->pid));

	ctx->ctx_flags.state = PFM_CTX_DISABLED;

	return 0;
}

static int
pfm_destroy_context(struct task_struct *task, pfm_context_t *ctx, void *arg, int count,
		    struct pt_regs *regs)
{
	/* we don't quite support this right now */
	if (task != current) return -EINVAL;

	/*
	 * if the context was never enabled, then there is not much
	 * to do
	 */
	if (!CTX_IS_ENABLED(ctx)) goto skipped_stop;

	/*
	 * Disable context: stop monitoring, flush regs to software state (useless here),
	 * and freeze PMU
	 *
	 * The IA64_THREAD_PM_VALID is cleared by pfm_flush_regs() called from pfm_disable()
	 */
	pfm_disable(task, ctx, arg, count, regs);

	if (ctx->ctx_fl_system) {
		ia64_psr(regs)->pp = 0;
	} else {
		ia64_psr(regs)->up = 0;
	}

	/* restore security level */
	ia64_psr(regs)->sp = 1;

skipped_stop:
	/*
	 * remove sampling buffer mapping, if any
	 */
	if (ctx->ctx_smpl_vaddr) pfm_remove_smpl_mapping(task);

	/* now free context and related state */
	pfm_context_exit(task);

	return 0;
}

/*
 * does nothing at the moment
 */
static int
pfm_unprotect_context(struct task_struct *task, pfm_context_t *ctx, void *arg, int count,
		      struct pt_regs *regs)
{
	return 0;
}

static int
pfm_protect_context(struct task_struct *task, pfm_context_t *ctx, void *arg, int count,
		    struct pt_regs *regs)
{
	DBprintk(("context from [%d] is protected\n", task->pid));

	/*
	 * from now on, only the creator of the context has access to it
	 */
	ctx->ctx_fl_protected = 1;

	/*
	 * reinforce secure monitoring: cannot toggle psr.up
	 */
	ia64_psr(regs)->sp = 1;

	return 0;
}
static int
pfm_debug(struct task_struct *task, pfm_context_t *ctx, void *arg, int count,
	  struct pt_regs *regs)
{
	unsigned int mode = *(unsigned int *)arg;

	pfm_debug_mode = mode == 0 ? 0 : 1;

	printk("perfmon debugging %s\n", pfm_debug_mode ? "on" : "off");

	return 0;
}

#ifdef PFM_PMU_USES_DBR

typedef struct {
	unsigned long ibr_mask:56;
	unsigned long ibr_plm:4;
	unsigned long ibr_ig:3;
	unsigned long ibr_x:1;
} ibr_mask_reg_t;

typedef struct {
	unsigned long dbr_mask:56;
	unsigned long dbr_plm:4;
	unsigned long dbr_ig:2;
	unsigned long dbr_w:1;
	unsigned long dbr_r:1;
} dbr_mask_reg_t;

typedef union {
	unsigned long  val;
	ibr_mask_reg_t ibr;
	dbr_mask_reg_t dbr;
} dbreg_t;

static int
pfm_write_ibr_dbr(int mode, struct task_struct *task, void *arg, int count,
		  struct pt_regs *regs)
{
	struct thread_struct *thread = &task->thread;
	pfm_context_t *ctx = task->thread.pfm_context;
	pfarg_dbreg_t tmp, *req = (pfarg_dbreg_t *)arg;
	dbreg_t dbreg;
	unsigned int rnum;
	int first_time;
	int i, ret = 0;

	/*
	 * for range restriction: psr.db must be cleared or the
	 * PMU will ignore the debug registers.
	 *
	 * XXX: may need more in system wide mode,
	 * no task can have this bit set?
	 */
	if (ia64_psr(regs)->db == 1) return -EINVAL;

	first_time = ctx->ctx_fl_using_dbreg == 0;

	/*
	 * check for debug registers in system wide mode
	 */
	LOCK_PFS();
	if (ctx->ctx_fl_system && first_time) {
		if (pfm_sessions.pfs_ptrace_use_dbregs)
			ret = -EBUSY;
		else
			pfm_sessions.pfs_sys_use_dbregs++;
	}
	UNLOCK_PFS();

	if (ret != 0) return ret;

	if (ctx->ctx_fl_system) {
		/* we mark ourselves as owner of the debug registers */
		ctx->ctx_fl_using_dbreg = 1;
	} else {
		if (ctx->ctx_fl_using_dbreg == 0) {
			ret = -EBUSY;
			if ((thread->flags & IA64_THREAD_DBG_VALID) != 0) {
				DBprintk(("debug registers already in use for [%d]\n", task->pid));
				goto abort_mission;
			}
			/* we mark ourselves as owner of the debug registers */
			ctx->ctx_fl_using_dbreg = 1;

			/*
			 * Given that debug registers cannot be used for both debugging
			 * and performance monitoring at the same time, we reuse
			 * the storage area to save and restore the registers on ctxsw.
			 */
			memset(task->thread.dbr, 0, sizeof(task->thread.dbr));
			memset(task->thread.ibr, 0, sizeof(task->thread.ibr));

			/*
			 * clear hardware registers to make sure we don't leak
			 * information and pick up stale state
			 */
			for (i = 0; i < pmu_conf.num_ibrs; i++) {
				ia64_set_ibr(i, 0UL);
			}
			for (i = 0; i < pmu_conf.num_dbrs; i++) {
				ia64_set_dbr(i, 0UL);
			}
		}
	}

	ret = -EFAULT;

	/*
	 * Now install the values into the registers
	 */
	for (i = 0; i < count; i++, req++) {

		if (copy_from_user(&tmp, req, sizeof(tmp))) goto abort_mission;

		rnum      = tmp.dbreg_num;
		dbreg.val = tmp.dbreg_value;

		ret = -EINVAL;

		if ((mode == 0 && !IBR_IS_IMPL(rnum)) || ((mode == 1) && !DBR_IS_IMPL(rnum))) {
			DBprintk(("invalid register %u val=0x%lx mode=%d i=%d count=%d\n",
				  rnum, dbreg.val, mode, i, count));
			goto abort_mission;
		}

		/*
		 * make sure we do not install an enabled breakpoint
		 */
		if (rnum & 0x1) {
			if (mode == 0)
				dbreg.ibr.ibr_x = 0;
			else
				dbreg.dbr.dbr_r = dbreg.dbr.dbr_w = 0;
		}
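		/*
		 * (On the test above: IBR/DBR registers come in pairs, with
		 * the address in the even register and the mask/control
		 * bits, including the enable bits, in the odd one. Clearing
		 * ibr_x, resp. dbr_r/dbr_w, therefore installs the
		 * breakpoint in a disarmed state.)
		 */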
		/*
		 * clear return flags and copy back to user
		 *
		 * XXX: fix once EAGAIN is implemented
		 */
		ret = -EFAULT;

		PFM_REG_RETFLAG_SET(tmp.dbreg_flags, 0);

		if (copy_to_user(req, &tmp, sizeof(tmp))) goto abort_mission;

		/*
		 * Debug registers, just like PMC, can only be modified
		 * by a kernel call. Moreover, perfmon() accesses to those
		 * registers are centralized in this routine. The hardware
		 * does not modify the value of these registers, therefore,
		 * if we save them as they are written, we can avoid having
		 * to save them on context switch out. This is made possible
		 * by the fact that when perfmon uses debug registers, ptrace()
		 * won't be able to modify them concurrently.
		 */
		if (mode == 0) {
			CTX_USED_IBR(ctx, rnum);

			ia64_set_ibr(rnum, dbreg.val);

			thread->ibr[rnum] = dbreg.val;

			DBprintk(("write ibr%u=0x%lx used_ibrs=0x%lx\n",
				  rnum, dbreg.val, ctx->ctx_used_ibrs[0]));
		} else {
			CTX_USED_DBR(ctx, rnum);

			ia64_set_dbr(rnum, dbreg.val);

			thread->dbr[rnum] = dbreg.val;

			DBprintk(("write dbr%u=0x%lx used_dbrs=0x%lx\n",
				  rnum, dbreg.val, ctx->ctx_used_dbrs[0]));
		}
	}

	return 0;

abort_mission:
	/*
	 * in case it was our first attempt, we undo the global modifications
	 */
	if (first_time) {
		LOCK_PFS();
		if (ctx->ctx_fl_system) {
			pfm_sessions.pfs_sys_use_dbregs--;
		}
		UNLOCK_PFS();
		ctx->ctx_fl_using_dbreg = 0;
	}
	/*
	 * install error return flag
	 */
	if (ret != -EFAULT) {
		/*
		 * XXX: for now we can only come here on EINVAL
		 */
		PFM_REG_RETFLAG_SET(tmp.dbreg_flags, PFM_REG_RETFL_EINVAL);
		copy_to_user(req, &tmp, sizeof(tmp));
	}
	return ret;
}

static int
pfm_write_ibrs(struct task_struct *task, pfm_context_t *ctx, void *arg, int count,
	       struct pt_regs *regs)
{
	/* we don't quite support this right now */
	if (task != current) return -EINVAL;

	if (!CTX_IS_ENABLED(ctx)) return -EINVAL;

	return pfm_write_ibr_dbr(0, task, arg, count, regs);
}

static int
pfm_write_dbrs(struct task_struct *task, pfm_context_t *ctx, void *arg, int count,
	       struct pt_regs *regs)
{
	/* we don't quite support this right now */
	if (task != current) return -EINVAL;

	if (!CTX_IS_ENABLED(ctx)) return -EINVAL;

	return pfm_write_ibr_dbr(1, task, arg, count, regs);
}

#endif /* PFM_PMU_USES_DBR */

static int
pfm_get_features(struct task_struct *task, pfm_context_t *ctx, void *arg, int count,
		 struct pt_regs *regs)
{
	pfarg_features_t tmp;

	memset(&tmp, 0, sizeof(tmp));

	tmp.ft_version	    = PFM_VERSION;
	tmp.ft_smpl_version = PFM_SMPL_VERSION;

	if (copy_to_user(arg, &tmp, sizeof(tmp))) return -EFAULT;

	return 0;
}

static int
pfm_start(struct task_struct *task, pfm_context_t *ctx, void *arg, int count,
	  struct pt_regs *regs)
{
	/* we don't quite support this right now */
	if (task != current) return -EINVAL;

	/*
	 * Cannot do anything before the PMU is enabled
	 */
	if (!CTX_IS_ENABLED(ctx)) return -EINVAL;

	DBprintk(("[%d] fl_system=%d owner=%p current=%p\n",
		  current->pid, ctx->ctx_fl_system, PMU_OWNER(), current));

	if (PMU_OWNER() != task) {
		printk("perfmon: pfm_start task [%d] not pmu owner\n", task->pid);
		return -EINVAL;
	}

	if (ctx->ctx_fl_system) {

		/* enable dcr pp */
		ia64_set_dcr(ia64_get_dcr()|IA64_DCR_PP);

#ifdef CONFIG_SMP
		local_cpu_data->pfm_dcr_pp = 1;
#else
		pfm_tasklist_toggle_pp(1);
#endif
		ia64_psr(regs)->pp = 1;

		__asm__ __volatile__ ("ssm psr.pp;;"::: "memory");

	} else {
		if ((task->thread.flags & IA64_THREAD_PM_VALID) == 0) {
			printk("perfmon: pfm_start task flag not set for [%d]\n", task->pid);
			return -EINVAL;
		}
		ia64_psr(regs)->up = 1;

		__asm__ __volatile__ ("sum psr.up;;"::: "memory");
	}
	ia64_srlz_d();

	return 0;
}
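/*
 * A note on the enable bits manipulated by pfm_start()/pfm_stop():
 * per-task sessions are driven by psr.up (toggled with sum/rum), whereas
 * system wide sessions use psr.pp (ssm/rsm) together with DCR.pp so that
 * the setting also propagates on interruption delivery. The saved copy in
 * pt_regs is updated as well so the state persists across the return to
 * user level.
 */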
static int
pfm_enable(struct task_struct *task, pfm_context_t *ctx, void *arg, int count,
	   struct pt_regs *regs)
{
	/* we don't quite support this right now */
	if (task != current) return -EINVAL;

	if (ctx->ctx_fl_system == 0 && PMU_OWNER() && PMU_OWNER() != current)
		pfm_lazy_save_regs(PMU_OWNER());

	/* reset all registers to stable quiet state */
	ia64_reset_pmu(task);

	/* make sure nothing starts */
	if (ctx->ctx_fl_system) {
		ia64_psr(regs)->pp = 0;
		ia64_psr(regs)->up = 0; /* just to make sure! */

		__asm__ __volatile__ ("rsm psr.pp;;"::: "memory");

#ifdef CONFIG_SMP
		local_cpu_data->pfm_syst_wide = 1;
		local_cpu_data->pfm_dcr_pp    = 0;
#endif
	} else {
		/*
		 * needed in case the task was a passive task during
		 * a system wide session and now wants to have its own
		 * session
		 */
		ia64_psr(regs)->pp = 0; /* just to make sure! */
		ia64_psr(regs)->up = 0;

		__asm__ __volatile__ ("rum psr.up;;"::: "memory");

		/*
		 * allow user control (user monitors only)
		 * if (task == ctx->ctx_owner) {
		 */
		{
			DBprintk(("clearing psr.sp for [%d]\n", current->pid));
			ia64_psr(regs)->sp = 0;
		}
		task->thread.flags |= IA64_THREAD_PM_VALID;
	}

	SET_PMU_OWNER(task);

	ctx->ctx_flags.state = PFM_CTX_ENABLED;
	atomic_set(&ctx->ctx_last_cpu, smp_processor_id());

	/* simply unfreeze */
	ia64_set_pmc(0, 0);
	ia64_srlz_d();

	return 0;
}

/*
 * functions MUST be listed in the increasing order of their index (see perfmon.h)
 */
static pfm_cmd_desc_t pfm_cmd_tab[]={
/* 0  */{ NULL, 0, 0, 0}, /* not used */
/* 1  */{ pfm_write_pmcs, PFM_CMD_PID|PFM_CMD_CTX|PFM_CMD_ARG_READ|PFM_CMD_ARG_WRITE, PFM_CMD_ARG_MANY, sizeof(pfarg_reg_t)},
/* 2  */{ pfm_write_pmds, PFM_CMD_PID|PFM_CMD_CTX|PFM_CMD_ARG_READ, PFM_CMD_ARG_MANY, sizeof(pfarg_reg_t)},
/* 3  */{ pfm_read_pmds, PFM_CMD_PID|PFM_CMD_CTX|PFM_CMD_ARG_READ|PFM_CMD_ARG_WRITE, PFM_CMD_ARG_MANY, sizeof(pfarg_reg_t)},
/* 4  */{ pfm_stop, PFM_CMD_PID|PFM_CMD_CTX, 0, 0},
/* 5  */{ pfm_start, PFM_CMD_PID|PFM_CMD_CTX, 0, 0},
/* 6  */{ pfm_enable, PFM_CMD_PID|PFM_CMD_CTX, 0, 0},
/* 7  */{ pfm_disable, PFM_CMD_PID|PFM_CMD_CTX, 0, 0},
/* 8  */{ pfm_create_context, PFM_CMD_ARG_READ, 1, sizeof(pfarg_context_t)},
/* 9  */{ pfm_destroy_context, PFM_CMD_PID|PFM_CMD_CTX, 0, 0},
/* 10 */{ pfm_restart, PFM_CMD_PID|PFM_CMD_CTX|PFM_CMD_NOCHK, 0, 0},
/* 11 */{ pfm_protect_context, PFM_CMD_PID|PFM_CMD_CTX, 0, 0},
/* 12 */{ pfm_get_features, PFM_CMD_ARG_WRITE, 0, 0},
/* 13 */{ pfm_debug, 0, 1, sizeof(unsigned int)},
/* 14 */{ pfm_unprotect_context, PFM_CMD_PID|PFM_CMD_CTX, 0, 0},
/* 15 */{ NULL, 0, 0, 0}, /* not used */
/* 16 */{ NULL, 0, 0, 0}, /* not used */
/* 17 */{ NULL, 0, 0, 0}, /* not used */
/* 18 */{ NULL, 0, 0, 0}, /* not used */
/* 19 */{ NULL, 0, 0, 0}, /* not used */
/* 20 */{ NULL, 0, 0, 0}, /* not used */
/* 21 */{ NULL, 0, 0, 0}, /* not used */
/* 22 */{ NULL, 0, 0, 0}, /* not used */
/* 23 */{ NULL, 0, 0, 0}, /* not used */
/* 24 */{ NULL, 0, 0, 0}, /* not used */
/* 25 */{ NULL, 0, 0, 0}, /* not used */
/* 26 */{ NULL, 0, 0, 0}, /* not used */
/* 27 */{ NULL, 0, 0, 0}, /* not used */
/* 28 */{ NULL, 0, 0, 0}, /* not used */
/* 29 */{ NULL, 0, 0, 0}, /* not used */
/* 30 */{ NULL, 0, 0, 0}, /* not used */
/* 31 */{ NULL, 0, 0, 0}, /* not used */
#ifdef PFM_PMU_USES_DBR
/* 32 */{ pfm_write_ibrs, PFM_CMD_PID|PFM_CMD_CTX|PFM_CMD_ARG_READ|PFM_CMD_ARG_WRITE, PFM_CMD_ARG_MANY, sizeof(pfarg_dbreg_t)},
/* 33 */{ pfm_write_dbrs, PFM_CMD_PID|PFM_CMD_CTX|PFM_CMD_ARG_READ|PFM_CMD_ARG_WRITE, PFM_CMD_ARG_MANY, sizeof(pfarg_dbreg_t)}
#endif
};
#define PFM_CMD_COUNT	(sizeof(pfm_cmd_tab)/sizeof(pfm_cmd_desc_t))
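/*
 * Dispatch note: the command value passed to perfmonctl() is used directly
 * as an index into pfm_cmd_tab[] (PFM_CMD_IDX() is the identity mapping),
 * so the NULL entries above make the corresponding command numbers
 * invalid: PFM_CMD_IS_VALID() rejects them before any dispatch occurs.
 */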
static int
check_task_state(struct task_struct *task)
{
    int ret = 0;
#ifdef CONFIG_SMP
    /* We must wait until the state has been completely
     * saved. There can be situations where the reader arrives after
     * the task is marked as STOPPED but before pfm_save_regs()
     * has completed.
     */
    for (;;) {

        task_lock(task);
        if (!task_has_cpu(task)) break;
        task_unlock(task);

        do {
            if (task->state != TASK_ZOMBIE && task->state != TASK_STOPPED)
                return -EBUSY;
            barrier();
            cpu_relax();
        } while (task_has_cpu(task));
    }
    task_unlock(task);
#else
    if (task->state != TASK_ZOMBIE && task->state != TASK_STOPPED) {
        DBprintk(("warning [%d] not in stable state %ld\n", task->pid, task->state));
        ret = -EBUSY;
    }
#endif
    return ret;
}

asmlinkage int
sys_perfmonctl(pid_t pid, int cmd, void *arg, int count, long arg5, long arg6,
               long arg7, long arg8, long stack)
{
    struct pt_regs *regs = (struct pt_regs *)&stack;
    struct task_struct *task = current;
    pfm_context_t *ctx = task->thread.pfm_context;
    size_t sz;
    int ret = -ESRCH, narg;

    /*
     * reject any call if perfmon was disabled at initialization time
     */
    if (PFM_IS_DISABLED()) return -ENOSYS;

    DBprintk(("cmd=%d idx=%d valid=%d narg=0x%x\n", cmd, PFM_CMD_IDX(cmd),
        PFM_CMD_IS_VALID(cmd), PFM_CMD_NARG(cmd)));

    if (PFM_CMD_IS_VALID(cmd) == 0) return -EINVAL;

    /* ignore arguments when command has none */
    narg = PFM_CMD_NARG(cmd);
    if ((narg == PFM_CMD_ARG_MANY && count == 0) || (narg > 0 && narg != count))
        return -EINVAL;

    sz = PFM_CMD_ARG_SIZE(cmd);

    if (PFM_CMD_READ_ARG(cmd) && !access_ok(VERIFY_READ, arg, sz*count))
        return -EFAULT;

    if (PFM_CMD_WRITE_ARG(cmd) && !access_ok(VERIFY_WRITE, arg, sz*count))
        return -EFAULT;

    if (PFM_CMD_USE_PID(cmd)) {
        /*
         * XXX: may need to fine tune this one
         */
        if (pid < 2) return -EPERM;

        if (pid != current->pid) {

            read_lock(&tasklist_lock);

            task = find_task_by_pid(pid);

            if (!task) goto abort_call;

            ret = -EPERM;

            if (pfm_bad_permissions(task)) goto abort_call;

            if (PFM_CMD_CHK(cmd)) {
                ret = check_task_state(task);
                if (ret != 0) goto abort_call;
            }
            ctx = task->thread.pfm_context;
        }
    }

    if (PFM_CMD_USE_CTX(cmd)) {
        ret = -EINVAL;
        if (ctx == NULL) {
            DBprintk(("no context for task %d\n", task->pid));
            goto abort_call;
        }
        ret = -EPERM;
        /*
         * we only grant access to the context if:
         *    - the caller is the creator of the context (ctx_owner)
         * OR - the context is attached to the caller AND the context is NOT
         *      in protected mode
         */
        if (ctx->ctx_owner != current && (ctx->ctx_fl_protected || task != current)) {
            DBprintk(("context protected, no access for [%d]\n", task->pid));
            goto abort_call;
        }
    }

    ret = (*pfm_cmd_tab[PFM_CMD_IDX(cmd)].cmd_func)(task, ctx, arg, count, regs);

abort_call:
    if (task != current) read_unlock(&tasklist_lock);

    return ret;
}
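/*
 * A minimal user-level sketch of one plausible calling sequence for the
 * system call above: create a context, enable it, program one PMC/PMD pair,
 * start counting, and read the virtualized counter back. The pfarg_*
 * structures and PFM_* command constants come from the perfmon user header;
 * the perfmonctl() stub and the exact pfarg_reg_t field names (reg_num,
 * reg_value) are assumed here for illustration.
 */
#if 0   /* illustrative user-level sketch only */
pfarg_context_t uctx;
pfarg_reg_t pc, pd;

memset(&uctx, 0, sizeof(uctx));
perfmonctl(getpid(), PFM_CREATE_CONTEXT, &uctx, 1);
perfmonctl(getpid(), PFM_ENABLE, NULL, 0);

memset(&pc, 0, sizeof(pc));
memset(&pd, 0, sizeof(pd));
pc.reg_num   = 4;       /* first generic counter pair (PMU_FIRST_COUNTER) */
pc.reg_value = 0UL;     /* event select/plm encoding for the CPU model */
pd.reg_num   = 4;
pd.reg_value = 0UL;     /* start counting from zero */
perfmonctl(getpid(), PFM_WRITE_PMCS, &pc, 1);
perfmonctl(getpid(), PFM_WRITE_PMDS, &pd, 1);

perfmonctl(getpid(), PFM_START, NULL, 0);
/* ... monitored workload ... */
perfmonctl(getpid(), PFM_STOP, NULL, 0);

perfmonctl(getpid(), PFM_READ_PMDS, &pd, 1);
/* pd.reg_value now holds the virtualized 64-bit count */
#endif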
void asmlinkage
pfm_ovfl_block_reset(u64 arg0, u64 arg1, u64 arg2, u64 arg3, u64 arg4,
                     u64 arg5, u64 arg6, u64 arg7, long info)
{
    struct thread_struct *th = &current->thread;
    pfm_context_t *ctx = current->thread.pfm_context;
    int ret;

    /*
     * clear the flag, to make sure we won't get here
     * again
     */
    th->pfm_ovfl_block_reset = 0;

    /*
     * do some sanity checks first
     */
    if (!ctx) {
        printk("perfmon: [%d] has no PFM context\n", current->pid);
        return;
    }

    if (CTX_OVFL_NOBLOCK(ctx)) goto non_blocking;

    DBprintk(("[%d] before sleeping\n", current->pid));

    /*
     * may go through without blocking on SMP systems
     * if restart has been received already by the time we call down()
     */
    ret = down_interruptible(&ctx->ctx_restart_sem);

    DBprintk(("[%d] after sleeping ret=%d\n", current->pid, ret));

    /*
     * in case of interruption of down() we don't restart anything
     */
    if (ret >= 0) {

non_blocking:
        /* we reactivate on context switch */
        ctx->ctx_fl_frozen = 0;
        /*
         * the ovfl_sem is cleared by the restart task and this is safe because we always
         * use the local reference
         */
        pfm_reset_regs(ctx, ctx->ctx_ovfl_regs, PFM_RELOAD_LONG_RESET);

        ctx->ctx_ovfl_regs[0] = 0UL;

        /*
         * Unlock sampling buffer and reset index atomically
         * XXX: not really needed when blocking
         */
        if (CTX_HAS_SMPL(ctx)) {
            ctx->ctx_psb->psb_hdr->hdr_count = 0;
            ctx->ctx_psb->psb_index = 0;
        }

        ia64_set_pmc(0, 0);
        ia64_srlz_d();

        /* state restored, can go back to work (user mode) */
    }
}

/*
 * This function will record an entry in the sampling buffer if the buffer is
 * not full already.
 * Return:
 *    0 : buffer is not full (did not BECOME full: still space or was already full)
 *    1 : buffer is full (recorded the last entry)
 */
static int
pfm_record_sample(struct task_struct *task, pfm_context_t *ctx, unsigned long ovfl_mask,
                  struct pt_regs *regs)
{
    pfm_smpl_buffer_desc_t *psb = ctx->ctx_psb;
    unsigned long *e, m, idx;
    perfmon_smpl_entry_t *h;
    int j;

    pfm_recorded_samples_count++;

    idx = ia64_fetch_and_add(1, &psb->psb_index);

    DBprintk(("recording index=%ld entries=%ld\n", idx-1, psb->psb_entries));

    /*
     * XXX: there is a small chance that we could run out on index before resetting
     * but index is unsigned long, so it will take some time.....
     * We use > instead of == because fetch_and_add() is off by one (see below)
     *
     * This case can happen in non-blocking mode or with multiple processes.
     * For non-blocking, we need to reload and continue.
     */
    if (idx > psb->psb_entries) return 0;

    /* first entry is really entry 0, not 1 caused by fetch_and_add */
    idx--;

    h = (perfmon_smpl_entry_t *)(((char *)psb->psb_addr) + idx*(psb->psb_entry_size));

    /*
     * initialize entry header
     */
    h->pid  = task->pid;
    h->cpu  = smp_processor_id();
    h->rate = 0;                         /* XXX: add the sampling rate used here */
    h->ip   = regs ? regs->cr_iip : 0x0; /* where the fault happened */
    h->regs = ovfl_mask;                 /* which registers overflowed */

    /* guaranteed to monotonically increase on each cpu */
    h->stamp  = pfm_get_stamp();
    h->period = 0UL; /* not yet used */

    /* position for first pmd */
    e = (unsigned long *)(h+1);

    /*
     * selectively store PMDs in increasing index number
     */
    m = ctx->ctx_smpl_regs[0];
    for (j = 0; m; m >>= 1, j++) {

        if ((m & 0x1) == 0) continue;

        if (PMD_IS_COUNTING(j)) {
            *e = pfm_read_soft_counter(ctx, j);
            /* check if this pmd overflowed as well */
            *e += ovfl_mask & (1UL<<j) ? 1 + pmu_conf.perf_ovfl_val : 0;
        } else {
            *e = ia64_get_pmd(j); /* slow */
        }
        e++;
    }

    /*
     * make the new entry visible to user, needs to be atomic
     */
    ia64_fetch_and_add(1, &psb->psb_hdr->hdr_count);

    DBprintk(("index=%ld entries=%ld hdr_count=%ld\n",
        idx, psb->psb_entries, psb->psb_hdr->hdr_count));

    /*
     * sampling buffer full ?
     */
    if (idx == (psb->psb_entries-1)) {
        DBprintk(("sampling buffer full\n"));
        /*
         * XXX: must reset buffer in blocking mode and when the notified task is lost
         */
        return 1;
    }
    return 0;
}
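/*
 * A sketch of how a monitoring tool might walk the buffer pfm_record_sample()
 * fills: the buffer starts with a perfmon_smpl_hdr_t whose hdr_count field is
 * the ia64_fetch_and_add() target above, followed by hdr_count fixed-size
 * entries. Each entry is a perfmon_smpl_entry_t header followed by the PMD
 * values selected in ctx_smpl_regs, in increasing register index order. The
 * entry-size field name (hdr_entry_size) is an assumption for this sketch.
 */
#if 0   /* illustrative user-level sketch only */
perfmon_smpl_hdr_t *hdr = (perfmon_smpl_hdr_t *)smpl_vaddr;
char *pos = (char *)(hdr + 1);
unsigned long k;

for (k = 0; k < hdr->hdr_count; k++) {
    perfmon_smpl_entry_t *ent = (perfmon_smpl_entry_t *)pos;
    unsigned long *pmds = (unsigned long *)(ent + 1);

    printf("pid=%d cpu=%d ip=0x%lx ovfl=0x%lx pmd=0x%lx\n",
        ent->pid, ent->cpu, ent->ip, ent->regs, pmds[0]);

    pos += hdr->hdr_entry_size; /* assumed field name */
}
#endif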
/*
 * main overflow processing routine.
 * it can be called from the interrupt path or explicitly during the context switch code
 * Return:
 *    new value of pmc[0]. if 0x0 then unfreeze, else keep frozen
 */
static unsigned long
pfm_overflow_handler(struct task_struct *task, u64 pmc0, struct pt_regs *regs)
{
    unsigned long mask;
    struct thread_struct *t;
    pfm_context_t *ctx;
    unsigned long old_val;
    unsigned long ovfl_notify = 0UL, ovfl_pmds = 0UL;
    int i;
    int my_cpu = smp_processor_id();
    int ret = 1;
    struct siginfo si;

    /*
     * It is never safe to access the task for which the overflow interrupt is destined
     * using the current variable as the interrupt may occur in the middle of a context switch
     * where current does not hold the task that is running yet.
     *
     * For monitoring, however, we do need to get access to the task which caused the overflow
     * to account for overflow on the counters.
     *
     * We accomplish this by maintaining a current owner of the PMU per CPU. During context
     * switch the ownership is changed in a way such that the reflected owner is always the
     * valid one, i.e. the one that caused the interrupt.
     */
    if (task == NULL) {
        DBprintk(("owners[%d]=NULL\n", my_cpu));
        return 0x1;
    }
    t   = &task->thread;
    ctx = task->thread.pfm_context;

    if (!ctx) {
        printk("perfmon: Spurious overflow interrupt: process %d has no PFM context\n",
            task->pid);
        return 0;
    }

    /*
     * XXX: debug test
     * Don't think this could happen given upfront tests
     */
    if ((t->flags & IA64_THREAD_PM_VALID) == 0 && ctx->ctx_fl_system == 0) {
        printk("perfmon: Spurious overflow interrupt: process %d not using perfmon\n",
            task->pid);
        return 0x1;
    }

    /*
     * sanity test. Should never happen
     */
    if ((pmc0 & 0x1) == 0) {
        printk("perfmon: pid %d pmc0=0x%lx assumption error for freeze bit\n",
            task->pid, pmc0);
        return 0x0;
    }

    mask = pmc0 >> PMU_FIRST_COUNTER;

    DBprintk(("pmc0=0x%lx pid=%d iip=0x%lx, %s"
        " mode used_pmds=0x%lx save_pmcs=0x%lx reload_pmcs=0x%lx\n",
        pmc0, task->pid, (regs ? regs->cr_iip : 0),
        CTX_OVFL_NOBLOCK(ctx) ? "nonblocking" : "blocking",
        ctx->ctx_used_pmds[0],
        ctx->ctx_saved_pmcs[0],
        ctx->ctx_reload_pmcs[0]));

    /*
     * First we update the virtual counters
     */
    for (i = PMU_FIRST_COUNTER; mask; i++, mask >>= 1) {

        /* skip pmd which did not overflow */
        if ((mask & 0x1) == 0) continue;

        DBprintk(("PMD[%d] overflowed hw_pmd=0x%lx soft_pmd=0x%lx\n",
            i, ia64_get_pmd(i), ctx->ctx_soft_pmds[i].val));

        /*
         * Because we sometimes (EARS/BTB) reset to a specific value, we cannot simply use
         * val to count the number of times we overflowed. Otherwise we would lose the
         * current value in the PMD (which can be >0). So to make sure we don't lose
         * the residual counts we set val to contain the full 64-bit value of the counter.
         */
        old_val = ctx->ctx_soft_pmds[i].val;
        ctx->ctx_soft_pmds[i].val = 1 + pmu_conf.perf_ovfl_val
                                  + pfm_read_soft_counter(ctx, i);

        DBprintk(("soft_pmd[%d].val=0x%lx old_val=0x%lx pmd=0x%lx\n",
            i, ctx->ctx_soft_pmds[i].val, old_val,
            ia64_get_pmd(i) & pmu_conf.perf_ovfl_val));

        /*
         * now that we have extracted the hardware counter, we can clear it to ensure
         * that a subsequent PFM_READ_PMDS will not include it again.
         */
        ia64_set_pmd(i, 0UL);

        /*
         * check for overflow condition
         */
        if (old_val > ctx->ctx_soft_pmds[i].val) {

            ovfl_pmds |= 1UL << i;

            DBprintk(("soft_pmd[%d] overflowed flags=0x%x, ovfl=0x%lx\n",
                i, ctx->ctx_soft_pmds[i].flags, ovfl_pmds));

            if (PMC_OVFL_NOTIFY(ctx, i)) {
                ovfl_notify |= 1UL << i;
            }
        }
    }

    /*
     * check for sampling buffer
     *
     * if present, record sample. We propagate notification ONLY when buffer
     * becomes full.
     */
    if (CTX_HAS_SMPL(ctx)) {
        ret = pfm_record_sample(task, ctx, ovfl_pmds, regs);
        if (ret == 1) {
            /*
             * Sampling buffer became full
             * If no notification was requested, then we reset buffer index
             * and reset registers (done below) and resume.
             * If notification requested, then defer reset until pfm_restart()
             */
            if (ovfl_notify == 0UL) {
                ctx->ctx_psb->psb_hdr->hdr_count = 0UL;
                ctx->ctx_psb->psb_index          = 0UL;
            }
        } else {
            /*
             * sample recorded in buffer, no need to notify user
             */
            ovfl_notify = 0UL;
        }
    }

    /*
     * No overflow requiring a user level notification
     */
    if (ovfl_notify == 0UL) {
        pfm_reset_regs(ctx, &ovfl_pmds, PFM_RELOAD_SHORT_RESET);
        return 0x0;
    }

    /*
     * keep track of what to reset when unblocking
     */
    ctx->ctx_ovfl_regs[0] = ovfl_pmds;

    /*
     * we have come to this point because there was an overflow and that notification
     * was requested. The notify_task may have disappeared, in which case notify_task
     * is NULL.
     */
    if (ctx->ctx_notify_task) {

        si.si_errno = 0;
        si.si_addr  = NULL;
        si.si_pid   = task->pid; /* who is sending */

        si.si_signo = SIGPROF;
        si.si_code  = PROF_OVFL; /* indicates a perfmon SIGPROF signal */

        /*
         * Shift the bitvector such that the user sees bit 4 for PMD4 and so on.
         * We only use smpl_ovfl[0] for now. It should be fine for quite a while
         * until we have more than 61 PMDs available.
         */
        si.si_pfm_ovfl[0] = ovfl_notify;

        /*
         * when the target of the signal is not ourself, we have to be more
         * careful. The notify_task may be cleared by the target task itself
         * in release_thread(). We must ensure mutual exclusion here such that
         * the signal is delivered (even to a dying task) safely.
         */
        if (ctx->ctx_notify_task != current) {
            /*
             * grab the notification lock for this task
             * This guarantees that the sequence: test + send_signal
             * is atomic with regards to the ctx_notify_task field.
             *
             * We need a spinlock and not just an atomic variable for this.
             */
            spin_lock(&ctx->ctx_lock);

            /*
             * now notify_task cannot be modified until we're done
             * if NULL, then it got modified while we were in the handler
             */
            if (ctx->ctx_notify_task == NULL) {

                spin_unlock(&ctx->ctx_lock);

                /*
                 * If we've lost the notified task, then we will run
                 * to completion but keep the PMU frozen. Results
                 * will be incorrect anyway. We do not kill the task
                 * so that it remains possible to attach a perfmon context
                 * to an already running task.
                 */
                goto lost_notify;
            }
            /*
             * required by send_sig_info() to make sure the target
             * task does not disappear on us.
             */
            read_lock(&tasklist_lock);
        }
        /*
         * in this case, we don't stop the task, we let it go on. It will
         * necessarily go to the signal handler (if any) when it goes back to
         * user mode.
         */
        DBprintk(("[%d] sending notification to [%d]\n",
            task->pid, ctx->ctx_notify_task->pid));

        /*
         * this call is safe in an interrupt handler, as is the read_lock()
         * on the tasklist_lock
         */
        ret = send_sig_info(SIGPROF, &si, ctx->ctx_notify_task);
        if (ret != 0)
            printk("send_sig_info(process %d, SIGPROF)=%d\n",
                ctx->ctx_notify_task->pid, ret);
        /*
         * now undo the protections in reverse order
         */
        if (ctx->ctx_notify_task != current) {
            read_unlock(&tasklist_lock);
            spin_unlock(&ctx->ctx_lock);
        }

        /*
         * if we block, set the pfm_must_block bit
         * when in block mode, we can effectively block only when the notified
         * task is not self, otherwise we would deadlock.
         * in this configuration, the notification is sent, the task will not
         * block on the way back to user mode, but the PMU will be kept frozen
         * until PFM_RESTART.
         * Note that here there is still a race condition with notify_task
         * possibly being nullified behind our back, but this is fine because
         * it can only be changed to NULL which by construction, can only be
         * done when notify_task != current. So if it was already different
         * before, changing it to NULL will still maintain this invariant.
         * Of course, when it is equal to current it cannot change at this point.
         */
        DBprintk(("block=%d notify [%d] current [%d]\n",
            ctx->ctx_fl_block,
            ctx->ctx_notify_task ? ctx->ctx_notify_task->pid : -1,
            current->pid));

        if (!CTX_OVFL_NOBLOCK(ctx) && ctx->ctx_notify_task != task) {
            t->pfm_ovfl_block_reset = 1; /* will cause blocking */
        }
    } else {
lost_notify:
        /* XXX: more to do here, to convert to non-blocking (reset values) */

        DBprintk(("notification task has disappeared !\n"));
        /*
         * for a non-blocking context, we make sure we do not fall into the
         * pfm_overflow_notify() trap. Also in the case of a blocking context with a
         * lost notify process, we do not want to block either (even though it is
         * interruptible). In this case, the PMU will be kept frozen and the process
         * will run to completion without monitoring enabled.
         *
         * Of course, we cannot lose the notify process when self-monitoring.
         */
        t->pfm_ovfl_block_reset = 0;
    }
    /*
     * If notification was successful, then we rely on the pfm_restart()
     * call to unfreeze and reset (in both blocking or non-blocking mode).
     *
     * If notification failed, then we will keep the PMU frozen and run
     * the task to completion
     */
    ctx->ctx_fl_frozen = 1;

    DBprintk(("reload pmc0=0x%x must_block=%ld\n",
        ctx->ctx_fl_frozen ? 0x1 : 0x0, t->pfm_ovfl_block_reset));

    return ctx->ctx_fl_frozen ? 0x1 : 0x0;
}
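/*
 * Worked example of the 64-bit software counter arithmetic used above, under
 * the assumption of 47-bit hardware counters (perf_ovfl_val = 2^47 - 1). The
 * software value keeps the high bits, the hardware PMD supplies the low bits,
 * and each hardware overflow contributes 1 + perf_ovfl_val = 2^47:
 *
 *    new = ((old & ~perf_ovfl_val) | (pmd & perf_ovfl_val)) + 2^47
 *
 * A task sampling every 1000 events loads the 64-bit software counter with
 * 2^64 - 1000, i.e. the PMD low bits start at 2^47 - 1000. After 1000 events
 * the hardware wraps, the 2^47 carry wraps the 64-bit software value past
 * zero, and the old_val > new test above flags the user-visible overflow.
 */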
static void
perfmon_interrupt(int irq, void *arg, struct pt_regs *regs)
{
    u64 pmc0;
    struct task_struct *task;

    pfm_ovfl_intr_count++;

    /*
     * srlz.d done before arriving here
     *
     * This is slow
     */
    pmc0 = ia64_get_pmc(0);

    /*
     * if we have some pending bits set
     * assumes: if any PMC[0].bit[63-1] is set, then PMC[0].fr = 1
     */
    if ((pmc0 & ~0x1UL) != 0UL && (task = PMU_OWNER()) != NULL) {
        /*
         * assumes, PMC[0].fr = 1 at this point
         *
         * XXX: change prototype to pass &pmc0
         */
        pmc0 = pfm_overflow_handler(task, pmc0, regs);

        /* we never explicitly freeze PMU here */
        if (pmc0 == 0) {
            ia64_set_pmc(0, 0);
            ia64_srlz_d();
        }
    } else {
        pfm_spurious_ovfl_intr_count++;

        DBprintk(("perfmon: Spurious PMU overflow interrupt on CPU%d: pmc0=0x%lx owner=%p\n",
            smp_processor_id(), pmc0, (void *)PMU_OWNER()));
    }
}

/* for debug only */
static int
perfmon_proc_info(char *page)
{
#ifdef CONFIG_SMP
#define cpu_is_online(i) (cpu_online_map & (1UL << i))
#else
#define cpu_is_online(i) 1
#endif
    char *p = page;
    u64 pmc0 = ia64_get_pmc(0);
    int i;

    p += sprintf(p, "perfmon enabled: %s\n", pmu_conf.pfm_is_disabled ? "No" : "Yes");
    p += sprintf(p, "monitor_pmcs[0]=0x%lx\n", pmu_conf.monitor_pmcs[0]);
    p += sprintf(p, "counter_pmds[0]=0x%lx\n", pmu_conf.counter_pmds[0]);
    p += sprintf(p, "overflow interrupts=%lu\n", pfm_ovfl_intr_count);
    p += sprintf(p, "spurious overflow interrupts=%lu\n", pfm_spurious_ovfl_intr_count);
    p += sprintf(p, "recorded samples=%lu\n", pfm_recorded_samples_count);

    p += sprintf(p, "CPU%d.pmc[0]=%lx\nPerfmon debug: %s\n",
        smp_processor_id(), pmc0, pfm_debug_mode ? "On" : "Off");
#ifdef CONFIG_SMP
    p += sprintf(p, "CPU%d cpu_data.pfm_syst_wide=%d cpu_data.dcr_pp=%d\n",
        smp_processor_id(), local_cpu_data->pfm_syst_wide, local_cpu_data->pfm_dcr_pp);
#endif

    LOCK_PFS();
    p += sprintf(p, "proc_sessions=%lu\nsys_sessions=%lu\nsys_use_dbregs=%lu\nptrace_use_dbregs=%lu\n",
        pfm_sessions.pfs_task_sessions,
        pfm_sessions.pfs_sys_sessions,
        pfm_sessions.pfs_sys_use_dbregs,
        pfm_sessions.pfs_ptrace_use_dbregs);
    UNLOCK_PFS();

    for (i = 0; i < NR_CPUS; i++) {
        if (cpu_is_online(i)) {
            p += sprintf(p, "CPU%d.pmu_owner: %-6d\n",
                i, pmu_owners[i].owner ? pmu_owners[i].owner->pid : -1);
        }
    }
    return p - page;
}

/* /proc interface, for debug only */
static int
perfmon_read_entry(char *page, char **start, off_t off, int count, int *eof, void *data)
{
    int len = perfmon_proc_info(page);

    if (len <= off+count) *eof = 1;

    *start = page + off;
    len   -= off;

    if (len > count) len = count;
    if (len < 0) len = 0;

    return len;
}
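/*
 * For reference, the /proc/perfmon output assembled above looks roughly like
 * this on a UP system (all values are illustrative only):
 *
 *    perfmon enabled: Yes
 *    monitor_pmcs[0]=0xf0
 *    counter_pmds[0]=0xf0
 *    overflow interrupts=1234
 *    spurious overflow interrupts=0
 *    recorded samples=1024
 *    CPU0.pmc[0]=1
 *    Perfmon debug: Off
 *    proc_sessions=1
 *    sys_sessions=0
 *    sys_use_dbregs=0
 *    ptrace_use_dbregs=0
 *    CPU0.pmu_owner: 1205
 */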
#ifdef CONFIG_SMP
void
pfm_syst_wide_update_task(struct task_struct *task, int mode)
{
    struct pt_regs *regs = (struct pt_regs *)((unsigned long) task + IA64_STK_OFFSET);

    regs--;

    /*
     * propagate the value of the dcr_pp bit to the psr
     */
    ia64_psr(regs)->pp = mode ? local_cpu_data->pfm_dcr_pp : 0;
}
#endif

void
pfm_save_regs(struct task_struct *task)
{
    pfm_context_t *ctx;
    u64 psr;

    ctx = task->thread.pfm_context;

    /*
     * save current PSR: needed because we modify it
     */
    __asm__ __volatile__ ("mov %0=psr;;" : "=r"(psr) :: "memory");

    /*
     * stop monitoring:
     * This is the last instruction which can generate an overflow
     *
     * We do not need to set psr.sp because it is irrelevant in the kernel.
     * It will be restored from ipsr when going back to user level
     */
    __asm__ __volatile__ ("rum psr.up;;"::: "memory");

    ctx->ctx_saved_psr = psr;

    //ctx->ctx_last_cpu = smp_processor_id();
}

static void
pfm_lazy_save_regs(struct task_struct *task)
{
    pfm_context_t *ctx;
    struct thread_struct *t;
    unsigned long mask;
    int i;

    DBprintk(("on [%d] by [%d]\n", task->pid, current->pid));

    t   = &task->thread;
    ctx = task->thread.pfm_context;

#ifdef CONFIG_SMP
    /*
     * announce we are saving this PMU state
     * This will cause other CPUs to wait until we're done
     * before using the context.
     *
     * must be an atomic operation
     */
    atomic_set(&ctx->ctx_saving_in_progress, 1);

    /*
     * if owner is NULL, it means that the other CPU won the race
     * and the IPI has caused the context to be saved in pfm_handle_fetch_regs()
     * instead of here. We have nothing to do
     *
     * note that this is safe, because the other CPU NEVER modifies saving_in_progress.
     */
    if (PMU_OWNER() == NULL) goto do_nothing;
#endif

    /*
     * we no longer own the PMU
     */
    SET_PMU_OWNER(NULL);

    ia64_srlz_d();

    /*
     * XXX needs further optimization.
     * Also must take holes into account
     */
    mask = ctx->ctx_used_pmds[0];
    for (i = 0; mask; i++, mask >>= 1) {
        if (mask & 0x1) t->pmd[i] = ia64_get_pmd(i);
    }

    /*
     * XXX: simplify to pmc0 only
     */
    mask = ctx->ctx_saved_pmcs[0];
    for (i = 0; mask; i++, mask >>= 1) {
        if (mask & 0x1) t->pmc[i] = ia64_get_pmc(i);
    }

    /* not owned by this CPU */
    atomic_set(&ctx->ctx_last_cpu, -1);

#ifdef CONFIG_SMP
do_nothing:
#endif
    /*
     * declare we are done saving this context
     *
     * must be an atomic operation
     */
    atomic_set(&ctx->ctx_saving_in_progress, 0);
}

#ifdef CONFIG_SMP
/*
 * Handles requests coming from other CPUs
 */
static void
pfm_handle_fetch_regs(void *info)
{
    pfm_smp_ipi_arg_t *arg = info;
    struct thread_struct *t;
    pfm_context_t *ctx;
    unsigned long mask;
    int i;

    ctx = arg->task->thread.pfm_context;
    t   = &arg->task->thread;

    DBprintk(("task=%d owner=%d saving=%d\n",
        arg->task->pid,
        PMU_OWNER() ? PMU_OWNER()->pid : -1,
        atomic_read(&ctx->ctx_saving_in_progress)));

    /* must wait if saving was interrupted */
    if (atomic_read(&ctx->ctx_saving_in_progress)) {
        arg->retval = 1;
        return;
    }

    /* can proceed, done with context */
    if (PMU_OWNER() != arg->task) {
        arg->retval = 0;
        return;
    }

    DBprintk(("saving state for [%d] save_pmcs=0x%lx all_pmcs=0x%lx used_pmds=0x%lx\n",
        arg->task->pid, ctx->ctx_saved_pmcs[0], ctx->ctx_reload_pmcs[0],
        ctx->ctx_used_pmds[0]));

    /*
     * XXX: will be replaced with pure assembly call
     */
    SET_PMU_OWNER(NULL);

    ia64_srlz_d();

    /*
     * XXX needs further optimization.
     * Also must take holes into account
     */
    mask = ctx->ctx_used_pmds[0];
    for (i = 0; mask; i++, mask >>= 1) {
        if (mask & 0x1) t->pmd[i] = ia64_get_pmd(i);
    }

    mask = ctx->ctx_saved_pmcs[0];
    for (i = 0; mask; i++, mask >>= 1) {
        if (mask & 0x1) t->pmc[i] = ia64_get_pmc(i);
    }

    /* not owned by this CPU */
    atomic_set(&ctx->ctx_last_cpu, -1);

    /* can proceed */
    arg->retval = 0;
}
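/*
 * The ctx_saving_in_progress flag and the per-CPU PMU owner implement a small
 * handshake between pfm_lazy_save_regs() and pfm_handle_fetch_regs(). Both run
 * on the CPU that holds the stale state: the lazy save from the context switch
 * path, the handler from the IPI sent by pfm_fetch_regs() (below) on behalf of
 * the CPU that wants the state. An illustrative interleaving:
 *
 *    context switch path                  IPI handler (for the remote CPU)
 *    -------------------                  --------------------------------
 *    saving_in_progress = 1
 *                                         sees saving_in_progress == 1
 *                                         arg->retval = 1: remote caller spins
 *    PMU_OWNER() != NULL, so saves
 *    PMDs/PMCs into the thread struct
 *    ctx_last_cpu = -1
 *    saving_in_progress = 0
 *                                         remote caller's busy wait terminates
 *
 * If instead the lazy save observes PMU_OWNER() == NULL, the IPI handler has
 * already saved the state and the lazy path takes the do_nothing exit.
 */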
/*
 * Function call to fetch PMU state from another CPU identified by 'cpu'.
 * If the context is being saved on the remote CPU, then we busy wait until
 * the saving is done and then we return. In this case, no IPI is sent.
 * Otherwise, we send an IPI to the remote CPU, potentially interrupting
 * pfm_lazy_save_regs() over there.
 *
 * If retval==1, then it means that we interrupted the remote save and that we
 * must wait until the saving is over before proceeding.
 * Otherwise, the saving was done on the remote CPU by the time we got there.
 * In either case, we can proceed.
 */
static void
pfm_fetch_regs(int cpu, struct task_struct *task, pfm_context_t *ctx)
{
    pfm_smp_ipi_arg_t arg;
    int ret;

    arg.task   = task;
    arg.retval = -1;

    if (atomic_read(&ctx->ctx_saving_in_progress)) {
        DBprintk(("no IPI, must wait for [%d] to be saved on [%d]\n", task->pid, cpu));

        /* busy wait */
        while (atomic_read(&ctx->ctx_saving_in_progress));
        return;
    }
    DBprintk(("calling CPU %d from CPU %d\n", cpu, smp_processor_id()));

    if (cpu == -1) {
        printk("refusing to use -1 for [%d]\n", task->pid);
        return;
    }

    /* will send IPI to other CPU and wait for completion of remote call */
    if ((ret = smp_call_function_single(cpu, pfm_handle_fetch_regs, &arg, 0, 1))) {
        printk("perfmon: remote CPU call from %d to %d error %d\n",
            smp_processor_id(), cpu, ret);
        return;
    }
    /*
     * we must wait until saving is over on the other CPU
     * This is the case where we interrupted the saving which started just at the
     * time we sent the IPI.
     */
    if (arg.retval == 1) {
        DBprintk(("must wait for [%d] to be saved on [%d]\n", task->pid, cpu));

        while (atomic_read(&ctx->ctx_saving_in_progress));

        DBprintk(("done saving for [%d] on [%d]\n", task->pid, cpu));
    }
}
#endif /* CONFIG_SMP */

void
pfm_load_regs(struct task_struct *task)
{
    struct thread_struct *t;
    pfm_context_t *ctx;
    struct task_struct *owner;
    unsigned long mask;
    u64 psr;
    int i;
#ifdef CONFIG_SMP
    int cpu;
#endif

    owner = PMU_OWNER();
    ctx   = task->thread.pfm_context;

    /*
     * if we were the last user, then nothing to do except restore psr
     */
    if (owner == task) {
        if (atomic_read(&ctx->ctx_last_cpu) != smp_processor_id())
            DBprintk(("invalid last_cpu=%d for [%d]\n",
                atomic_read(&ctx->ctx_last_cpu), task->pid));

        psr = ctx->ctx_saved_psr;
        __asm__ __volatile__ ("mov psr.l=%0;; srlz.i;;"::"r"(psr): "memory");

        return;
    }
    DBprintk(("load_regs: must reload for [%d] owner=%d\n",
        task->pid, owner ? owner->pid : -1));

    /*
     * someone else is still using the PMU, first push it out and
     * then we'll be able to install our stuff!
     */
    if (owner) pfm_lazy_save_regs(owner);

#ifdef CONFIG_SMP
    /*
     * check if context on another CPU (-1 means saved)
     * We MUST use the variable, as last_cpu may change behind our
     * back. If it changes to -1 (not on a CPU anymore), then in cpu
     * we have the last CPU the context was on. We may be sending the
     * IPI for nothing, but we have no way of verifying this.
     */
    cpu = atomic_read(&ctx->ctx_last_cpu);
    if (cpu != -1) {
        pfm_fetch_regs(cpu, task, ctx);
    }
#endif
    t = &task->thread;

    /*
     * XXX: will be replaced by assembly routine
     * We clear all unused PMDs to avoid leaking information
     */
    mask = ctx->ctx_used_pmds[0];
    for (i = 0; mask; i++, mask >>= 1) {
        if (mask & 0x1)
            ia64_set_pmd(i, t->pmd[i]);
        else
            ia64_set_pmd(i, 0UL);
    }
    /* XXX: will need to clear all unused pmd, for security */

    /*
     * skip pmc[0] to avoid side-effects,
     * all PMCs are systematically reloaded, unused ones get a default value
     * to avoid picking up stale configuration
     */
    mask = ctx->ctx_reload_pmcs[0] >> 1;
    for (i = 1; mask; i++, mask >>= 1) {
        if (mask & 0x1) ia64_set_pmc(i, t->pmc[i]);
    }

    /*
     * restore debug registers when used for range restrictions.
     * We must restore the unused registers to avoid picking up
     * stale information.
     */
    mask = ctx->ctx_used_ibrs[0];
    for (i = 0; mask; i++, mask >>= 1) {
        if (mask & 0x1)
            ia64_set_ibr(i, t->ibr[i]);
        else
            ia64_set_ibr(i, 0UL);
    }

    mask = ctx->ctx_used_dbrs[0];
    for (i = 0; mask; i++, mask >>= 1) {
        if (mask & 0x1)
            ia64_set_dbr(i, t->dbr[i]);
        else
            ia64_set_dbr(i, 0UL);
    }

    if (t->pmc[0] & ~0x1) {
        ia64_srlz_d();
        pfm_overflow_handler(task, t->pmc[0], NULL);
    }

    /*
     * fl_frozen==1 when we are in blocking mode waiting for restart
     */
    if (ctx->ctx_fl_frozen == 0) {
        ia64_set_pmc(0, 0);
        ia64_srlz_d();
    }
    atomic_set(&ctx->ctx_last_cpu, smp_processor_id());

    SET_PMU_OWNER(task);

    /*
     * restore the psr we changed in pfm_save_regs()
     */
    psr = ctx->ctx_saved_psr;
    __asm__ __volatile__ ("mov psr.l=%0;; srlz.i;;"::"r"(psr): "memory");
}
/*
 * XXX: make this routine able to work with non current context
 */
static void
ia64_reset_pmu(struct task_struct *task)
{
    struct thread_struct *t = &task->thread;
    pfm_context_t *ctx = t->pfm_context;
    unsigned long mask;
    int i;

    if (task != current) {
        printk("perfmon: invalid task in ia64_reset_pmu()\n");
        return;
    }

    /* Let's make sure the PMU is frozen */
    ia64_set_pmc(0, 1);

    /*
     * install reset values for PMC. We skip PMC0 (done above)
     * XXX: good up to 64 PMCs
     */
    mask = pmu_conf.impl_regs[0] >> 1;
    for (i = 1; mask; mask >>= 1, i++) {
        if (mask & 0x1) {
            ia64_set_pmc(i, reset_pmcs[i]);
            /*
             * When restoring context, we must restore ALL pmcs, even the ones
             * that the task does not use, to avoid leaks and possibly corruption
             * of the session because of configuration conflicts. So here, we
             * initialize the table used by the context switch restore routine.
             */
            t->pmc[i] = reset_pmcs[i];
            DBprintk((" pmc[%d]=0x%lx\n", i, reset_pmcs[i]));
        }
    }
    /*
     * clear reset values for PMD.
     * XXX: good up to 64 PMDs. Assumes that zero is a valid value.
     */
    mask = pmu_conf.impl_regs[4];
    for (i = 0; mask; mask >>= 1, i++) {
        if (mask & 0x1) ia64_set_pmd(i, 0UL);
    }

    /*
     * On context switch restore, we must restore ALL pmc even
     * when they are not actively used by the task. In UP, the incoming process
     * may otherwise pick up left over PMC state from the previous process.
     * As opposed to PMD, stale PMC can cause harm to the incoming
     * process because they may change what is being measured.
     * Therefore, we must systematically reinstall the entire
     * PMC state. In SMP, the same thing is possible on the
     * same CPU but also between 2 CPUs.
     *
     * There is unfortunately no easy way to avoid this problem
     * on either UP or SMP. This definitely slows down the
     * pfm_load_regs().
     */

    /*
     * We must include all the PMC in this mask to make sure we don't
     * see any side effect of the stale state, such as opcode matching
     * or range restrictions, for instance.
     */
    ctx->ctx_reload_pmcs[0] = pmu_conf.impl_regs[0];

    /*
     * useful in case of re-enable after disable
     */
    ctx->ctx_used_pmds[0] = 0UL;
    ctx->ctx_used_ibrs[0] = 0UL;
    ctx->ctx_used_dbrs[0] = 0UL;

    ia64_srlz_d();
}

/*
 * This function is called when a thread exits (from exit_thread()).
 * This is a simplified pfm_save_regs() that simply flushes the current
 * register state into the save area taking into account any pending
 * overflow. This time no notification is sent because the task is dying
 * anyway. The inline processing of overflows avoids losing some counts.
 * The PMU is frozen on exit from this call and is never to be reenabled
 * again for this task.
 */
void
pfm_flush_regs(struct task_struct *task)
{
    pfm_context_t *ctx;
    u64 pmc0;
    unsigned long mask, mask2, val;
    int i;

    ctx = task->thread.pfm_context;

    if (ctx == NULL) return;

    /*
     * that's it if context already disabled
     */
    if (ctx->ctx_flags.state == PFM_CTX_DISABLED) return;

    /*
     * stop monitoring:
     * This is the only way to stop monitoring without destroying overflow
     * information in PMC[0].
     * This is the last instruction which can cause overflow when monitoring
     * in kernel.
     * By now, we could still have an overflow interrupt in-flight.
     */
    if (ctx->ctx_fl_system) {

        __asm__ __volatile__ ("rsm psr.pp;;"::: "memory");

        /* disable dcr pp */
        ia64_set_dcr(ia64_get_dcr() & ~IA64_DCR_PP);

#ifdef CONFIG_SMP
        local_cpu_data->pfm_syst_wide = 0;
        local_cpu_data->pfm_dcr_pp    = 0;
#else
        pfm_tasklist_toggle_pp(0);
#endif
    } else {

        __asm__ __volatile__ ("rum psr.up;;"::: "memory");

        /* no more save/restore on ctxsw */
        current->thread.flags &= ~IA64_THREAD_PM_VALID;
    }

    /*
     * Mark the PMU as not owned
     * This will cause the interrupt handler to do nothing in case an overflow
     * interrupt was in-flight
     * This also guarantees that pmc0 will contain the final state
     * It virtually gives us full control over overflow processing from that
     * point on.
     * It must be an atomic operation.
     */
    SET_PMU_OWNER(NULL);

    /*
     * read current overflow status:
     *
     * we are guaranteed to read the final stable state
     */
    ia64_srlz_d();
    pmc0 = ia64_get_pmc(0); /* slow */

    /*
     * freeze PMU:
     *
     * This destroys the overflow information. This is required to make sure
     * next process does not start with monitoring on if not requested
     */
    ia64_set_pmc(0, 1);
    ia64_srlz_d();

    /*
     * We don't need to restore psr, because we are on our way out anyway
     */

    /*
     * This loop flushes the PMD into the PFM context.
     * It also processes overflow inline.
     *
     * IMPORTANT: No notification is sent at this point as the process is dying.
     * The implicit notification will come from a SIGCHLD or a return from a
     * waitpid().
     */
    if (atomic_read(&ctx->ctx_last_cpu) != smp_processor_id())
        printk("perfmon: [%d] last_cpu=%d\n",
            task->pid, atomic_read(&ctx->ctx_last_cpu));

    mask  = pmc0 >> PMU_FIRST_COUNTER;
    mask2 = ctx->ctx_used_pmds[0] >> PMU_FIRST_COUNTER;
    for (i = PMU_FIRST_COUNTER; mask2; i++, mask >>= 1, mask2 >>= 1) {

        /* skip unused pmds */
        if ((mask2 & 0x1) == 0) continue;

        val = ia64_get_pmd(i);

        if (PMD_IS_COUNTING(i)) {
            DBprintk(("[%d] pmd[%d] soft_pmd=0x%lx hw_pmd=0x%lx\n",
                task->pid, i, ctx->ctx_soft_pmds[i].val,
                val & pmu_conf.perf_ovfl_val));

            /* collect latest results */
            ctx->ctx_soft_pmds[i].val += val & pmu_conf.perf_ovfl_val;

            /*
             * now everything is in ctx_soft_pmds[] and we need
             * to clear the saved context from save_regs() such that
             * pfm_read_pmds() gets the correct value
             */
            task->thread.pmd[i] = 0;

            /* take care of overflow inline */
            if (mask & 0x1) {
                ctx->ctx_soft_pmds[i].val += 1 + pmu_conf.perf_ovfl_val;
                DBprintk(("[%d] pmd[%d] overflowed soft_pmd=0x%lx\n",
                    task->pid, i, ctx->ctx_soft_pmds[i].val));
            }
        } else {
            DBprintk(("[%d] pmd[%d] hw_pmd=0x%lx\n", task->pid, i, val));
            /* not a counter, just save value as is */
            task->thread.pmd[i] = val;
        }
    }
    /*
     * indicates that context has been saved
     */
    atomic_set(&ctx->ctx_last_cpu, -1);
}
/*
 * task is the newly created task, pt_regs for new child
 */
int
pfm_inherit(struct task_struct *task, struct pt_regs *regs)
{
    pfm_context_t *ctx = current->thread.pfm_context;
    pfm_context_t *nctx;
    struct thread_struct *th = &task->thread;
    unsigned long m;
    int i;

    /*
     * make sure child cannot mess up the monitoring session
     */
    ia64_psr(regs)->sp = 1;
    DBprintk(("enabling psr.sp for [%d]\n", task->pid));

    /*
     * remove any sampling buffer mapping from child user
     * address space. Must be done for all cases of inheritance.
     */
    if (ctx->ctx_smpl_vaddr) pfm_remove_smpl_mapping(task);

    /*
     * takes care of easiest case first
     */
    if (CTX_INHERIT_MODE(ctx) == PFM_FL_INHERIT_NONE) {
        DBprintk(("removing PFM context for [%d]\n", task->pid));
        task->thread.pfm_context = NULL;
        task->thread.pfm_ovfl_block_reset = 0;
        /* copy_thread() clears IA64_THREAD_PM_VALID */
        return 0;
    }
    nctx = pfm_context_alloc();
    if (nctx == NULL) return -ENOMEM;

    /* copy content */
    *nctx = *ctx;

    if (CTX_INHERIT_MODE(ctx) == PFM_FL_INHERIT_ONCE) {
        nctx->ctx_fl_inherit = PFM_FL_INHERIT_NONE;
        atomic_set(&nctx->ctx_last_cpu, -1);

        /*
         * task is not yet visible in the tasklist, so we do
         * not need to lock the newly created context.
         * However, we must grab the tasklist_lock to ensure
         * that the ctx_owner or ctx_notify_task do not disappear
         * while we increment their check counters.
         */
        read_lock(&tasklist_lock);

        if (nctx->ctx_notify_task)
            atomic_inc(&nctx->ctx_notify_task->thread.pfm_notifiers_check);

        if (nctx->ctx_owner)
            atomic_inc(&nctx->ctx_owner->thread.pfm_owners_check);

        read_unlock(&tasklist_lock);

        DBprintk(("downgrading to INHERIT_NONE for [%d]\n", task->pid));

        LOCK_PFS();
        pfm_sessions.pfs_task_sessions++;
        UNLOCK_PFS();
    }

    /* initialize counters in new context */
    m = pmu_conf.counter_pmds[0] >> PMU_FIRST_COUNTER;
    for (i = PMU_FIRST_COUNTER; m; m >>= 1, i++) {
        if (m & 0x1) {
            nctx->ctx_soft_pmds[i].val = nctx->ctx_soft_pmds[i].ival & ~pmu_conf.perf_ovfl_val;
            th->pmd[i] = nctx->ctx_soft_pmds[i].ival & pmu_conf.perf_ovfl_val;
        }
    }
    /* clear BTB index register */
    th->pmd[16] = 0;

    /* if sampling then increment number of users of buffer */
    if (nctx->ctx_psb) {
        /*
         * XXX: not very pretty!
         */
        LOCK_PSB(nctx->ctx_psb);
        nctx->ctx_psb->psb_refcnt++;
        UNLOCK_PSB(nctx->ctx_psb);
        /*
         * remove any pointer to sampling buffer mapping
         */
        nctx->ctx_smpl_vaddr = 0;
    }

    nctx->ctx_fl_frozen    = 0;
    nctx->ctx_ovfl_regs[0] = 0UL;
    sema_init(&nctx->ctx_restart_sem, 0); /* reset this semaphore to locked */

    /* clear pending notification */
    th->pfm_ovfl_block_reset = 0;

    /* link with new task */
    th->pfm_context = nctx;

    DBprintk(("nctx=%p for process [%d]\n", (void *)nctx, task->pid));

    /*
     * the copy_thread routine automatically clears
     * IA64_THREAD_PM_VALID, so we need to reenable it, if it was used by the caller
     */
    if (current->thread.flags & IA64_THREAD_PM_VALID) {
        DBprintk(("setting PM_VALID for [%d]\n", task->pid));
        th->flags |= IA64_THREAD_PM_VALID;
    }

    return 0;
}
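/*
 * The inheritance mode tested above is chosen by the monitoring tool at
 * context creation time. A user-level sketch (the ctx_flags field and the
 * PFM_FL_INHERIT_* constants are the ones consumed by CTX_INHERIT_MODE();
 * the exact pfarg_context_t layout is assumed from the perfmon user header):
 */
#if 0   /* illustrative user-level sketch only */
pfarg_context_t uctx;

memset(&uctx, 0, sizeof(uctx));
uctx.ctx_flags = PFM_FL_INHERIT_ONCE;   /* children monitored, grandchildren not */
perfmonctl(getpid(), PFM_CREATE_CONTEXT, &uctx, 1);
#endif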
/*
 * We cannot touch any of the PMU registers at this point as we may
 * not be running on the same CPU the task was last run on. Therefore
 * it is assumed that the PMU has been stopped appropriately in
 * pfm_flush_regs() called from exit_thread().
 *
 * The function is called in the context of the parent via a release_thread()
 * and wait4(). The task is not in the tasklist anymore.
 */
void
pfm_context_exit(struct task_struct *task)
{
    pfm_context_t *ctx = task->thread.pfm_context;

    /*
     * check sampling buffer
     */
    if (ctx->ctx_psb) {
        pfm_smpl_buffer_desc_t *psb = ctx->ctx_psb;

        LOCK_PSB(psb);

        DBprintk(("sampling buffer from [%d] @%p size %ld vma_flag=0x%x\n",
            task->pid, psb->psb_hdr, psb->psb_size, psb->psb_flags));

        /*
         * in the case where we are the last user, we may be able to free
         * the buffer
         */
        psb->psb_refcnt--;

        if (psb->psb_refcnt == 0) {

            /*
             * The flag is cleared in pfm_vm_close(), which gets
             * called from do_exit() via exit_mm().
             * By the time we come here, the task has no more mm context.
             *
             * We can only free the psb and buffer here after the vm area
             * describing the buffer has been removed. This normally happens
             * as part of do_exit() but the entire mm context is ONLY removed
             * once its reference count goes to zero. This is typically
             * the case except for multi-threaded (several tasks) processes.
             *
             * See pfm_vm_close() and pfm_cleanup_smpl_buf() for more details.
             */
            if ((psb->psb_flags & PFM_PSB_VMA) == 0) {

                DBprintk(("cleaning sampling buffer from [%d] @%p size %ld\n",
                    task->pid, psb->psb_hdr, psb->psb_size));

                /*
                 * free the buffer and psb
                 */
                pfm_rvfree(psb->psb_hdr, psb->psb_size);
                kfree(psb);
                psb = NULL;
            }
        }
        /* psb may have been deleted */
        if (psb) UNLOCK_PSB(psb);
    }

    DBprintk(("cleaning [%d] pfm_context @%p notify_task=%p check=%d mm=%p\n",
        task->pid, ctx, ctx->ctx_notify_task,
        atomic_read(&task->thread.pfm_notifiers_check), task->mm));

    /*
     * To avoid having the notified task or the owner task scan the entire process
     * list when they exit, we decrement notifiers_check and owners_check respectively.
     *
     * Of course, there is a race condition between decreasing the value and the
     * task exiting. The danger comes from the fact that, in both cases, we have a
     * direct pointer to a task structure thereby bypassing the tasklist.
     * We must make sure that, if we have task != NULL, the target task is still
     * present and is identical to the initial task specified
     * during pfm_create_context(). It may already be detached from the tasklist but
     * that's okay. Note that it is okay if we miss the deadline and the task scans
     * the list for nothing, it will affect performance but not correctness.
     * The correctness is ensured by using the ctx_lock which prevents the
     * notify_task from changing the fields in our context.
     * Once holding this lock, if we see task != NULL, then it will stay like
     * that until we release the lock. If it is NULL already then we came too late.
     */
    LOCK_CTX(ctx);

    if (ctx->ctx_notify_task != NULL) {
        DBprintk(("[%d], [%d] atomic_sub on [%d] notifiers=%u\n", current->pid,
            task->pid, ctx->ctx_notify_task->pid,
            atomic_read(&ctx->ctx_notify_task->thread.pfm_notifiers_check)));

        atomic_dec(&ctx->ctx_notify_task->thread.pfm_notifiers_check);
    }

    if (ctx->ctx_owner != NULL) {
        DBprintk(("[%d], [%d] atomic_sub on [%d] owners=%u\n", current->pid,
            task->pid, ctx->ctx_owner->pid,
            atomic_read(&ctx->ctx_owner->thread.pfm_owners_check)));

        atomic_dec(&ctx->ctx_owner->thread.pfm_owners_check);
    }

    UNLOCK_CTX(ctx);

    LOCK_PFS();

    if (ctx->ctx_fl_system) {

        pfm_sessions.pfs_sys_session[ctx->ctx_cpu] = NULL;
        pfm_sessions.pfs_sys_sessions--;
        DBprintk(("freeing syswide session on CPU%ld\n", ctx->ctx_cpu));

        /* update perfmon debug register counter */
        if (ctx->ctx_fl_using_dbreg) {
            if (pfm_sessions.pfs_sys_use_dbregs == 0) {
                printk("perfmon: invalid release for [%d] sys_use_dbregs=0\n",
                    task->pid);
            } else
                pfm_sessions.pfs_sys_use_dbregs--;
        }

        /*
         * remove any CPU pinning
         */
        task->cpus_allowed = ctx->ctx_saved_cpus_allowed;
        task->need_resched = 1;
    } else {
        pfm_sessions.pfs_task_sessions--;
    }

    UNLOCK_PFS();

    pfm_context_free(ctx);
    /*
     * clean pfm state in thread structure.
     */
    task->thread.pfm_context = NULL;
    task->thread.pfm_ovfl_block_reset = 0;

    /* pfm_notifiers is cleaned in pfm_cleanup_notifiers() */
}

/*
 * function invoked from release_thread when pfm_smpl_buf_list is not NULL
 */
int
pfm_cleanup_smpl_buf(struct task_struct *task)
{
    pfm_smpl_buffer_desc_t *tmp, *psb = task->thread.pfm_smpl_buf_list;

    if (psb == NULL) {
        printk("perfmon: psb is null in [%d]\n", current->pid);
        return -1;
    }
    /*
     * Walk through the list and free the sampling buffer and psb
     */
    while (psb) {
        DBprintk(("[%d] freeing smpl @%p size %ld\n",
            current->pid, psb->psb_hdr, psb->psb_size));

        pfm_rvfree(psb->psb_hdr, psb->psb_size);
        tmp = psb->psb_next;
        kfree(psb);
        psb = tmp;
    }

    /* just in case */
    task->thread.pfm_smpl_buf_list = NULL;

    return 0;
}

/*
 * function invoked from release_thread to make sure that the ctx_owner field
 * does not point to a nonexistent task.
 */
void
pfm_cleanup_owners(struct task_struct *task)
{
    struct task_struct *p;
    pfm_context_t *ctx;

    DBprintk(("called by [%d] for [%d]\n", current->pid, task->pid));

    read_lock(&tasklist_lock);

    for_each_task(p) {
        /*
         * It is safe to do the 2-step test here, because thread.ctx
         * is cleaned up only in release_thread() and at that point
         * the task has been detached from the tasklist which is an
         * operation which uses the write_lock() on the tasklist_lock
         * so it cannot run concurrently to this loop. So we have the
         * guarantee that if we find p and it has a perfmon ctx then
         * it is going to stay like this for the entire execution of this
         * loop.
         */
        ctx = p->thread.pfm_context;

        //DBprintk(("[%d] scanning task [%d] ctx=%p\n", task->pid, p->pid, ctx));

        if (ctx && ctx->ctx_owner == task) {
            DBprintk(("trying for owner [%d] in [%d]\n", task->pid, p->pid));
            /*
             * the spinlock is required to take care of a race condition
             * with the send_sig_info() call. We must make sure that
             * either the send_sig_info() completes using a valid task,
             * or the notify_task is cleared before the send_sig_info()
             * can pick up a stale value. Note that by the time this
             * function is executed the 'task' is already detached from the
             * tasklist. The problem is that the notifiers have a direct
             * pointer to it. It is okay to send a signal to a task in this
             * stage, it simply will have no effect. But it is better than sending
             * to a completely destroyed task or worse to a new task using the same
             * task_struct address.
             */
            LOCK_CTX(ctx);

            ctx->ctx_owner = NULL;

            UNLOCK_CTX(ctx);

            DBprintk(("done for owner [%d] in [%d]\n", task->pid, p->pid));
        }
    }
    read_unlock(&tasklist_lock);
}

/*
 * function called from release_thread to make sure that the ctx_notify_task
 * is not pointing to a nonexistent task
 */
void
pfm_cleanup_notifiers(struct task_struct *task)
{
    struct task_struct *p;
    pfm_context_t *ctx;

    DBprintk(("called by [%d] for [%d]\n", current->pid, task->pid));

    read_lock(&tasklist_lock);

    for_each_task(p) {
        /*
         * It is safe to do the 2-step test here, because thread.ctx
         * is cleaned up only in release_thread() and at that point
         * the task has been detached from the tasklist which is an
         * operation which uses the write_lock() on the tasklist_lock
         * so it cannot run concurrently to this loop. So we have the
         * guarantee that if we find p and it has a perfmon ctx then
         * it is going to stay like this for the entire execution of this
         * loop.
         */
        ctx = p->thread.pfm_context;

        //DBprintk(("[%d] scanning task [%d] ctx=%p\n", task->pid, p->pid, ctx));

        if (ctx && ctx->ctx_notify_task == task) {
            DBprintk(("trying for notifier [%d] in [%d]\n", task->pid, p->pid));
            /*
             * the spinlock is required to take care of a race condition
             * with the send_sig_info() call. We must make sure that
             * either the send_sig_info() completes using a valid task,
             * or the notify_task is cleared before the send_sig_info()
             * can pick up a stale value. Note that by the time this
             * function is executed the 'task' is already detached from the
             * tasklist. The problem is that the notifiers have a direct
             * pointer to it. It is okay to send a signal to a task in this
             * stage, it simply will have no effect. But it is better than sending
             * to a completely destroyed task or worse to a new task using the same
             * task_struct address.
             */
            LOCK_CTX(ctx);

            ctx->ctx_notify_task = NULL;

            UNLOCK_CTX(ctx);

            DBprintk(("done for notifier [%d] in [%d]\n", task->pid, p->pid));
        }
    }
    read_unlock(&tasklist_lock);
}

static struct irqaction perfmon_irqaction = {
    handler:    perfmon_interrupt,
    flags:      SA_INTERRUPT,
    name:       "perfmon"
};

static void
pfm_pmu_snapshot(void)
{
    int i;

    for (i = 0; i < IA64_NUM_PMC_REGS; i++) {
        if (i >= pmu_conf.num_pmcs) break;
        if (PMC_IS_IMPL(i)) reset_pmcs[i] = ia64_get_pmc(i);
    }
}

/*
 * perfmon initialization routine, called from the initcall() table
 */
int __init
perfmon_init(void)
{
    pal_perf_mon_info_u_t pm_info;
    s64 status;

    register_percpu_irq(IA64_PERFMON_VECTOR, &perfmon_irqaction);

    ia64_set_pmv(IA64_PERFMON_VECTOR);
    ia64_srlz_d();

    pmu_conf.pfm_is_disabled = 1;

    printk("perfmon: version %u.%u (sampling format v%u.%u) IRQ %u\n",
        PFM_VERSION_MAJ, PFM_VERSION_MIN,
        PFM_SMPL_VERSION_MAJ, PFM_SMPL_VERSION_MIN,
        IA64_PERFMON_VECTOR);

    if ((status = ia64_pal_perf_mon_info(pmu_conf.impl_regs, &pm_info)) != 0) {
        printk("perfmon: PAL call failed (%ld), perfmon disabled\n", status);
        return -1;
    }

    pmu_conf.perf_ovfl_val = (1UL << pm_info.pal_perf_mon_info_s.width) - 1;
    pmu_conf.max_counters  = pm_info.pal_perf_mon_info_s.generic;
    pmu_conf.num_pmcs      = find_num_pm_regs(pmu_conf.impl_regs);
    pmu_conf.num_pmds      = find_num_pm_regs(&pmu_conf.impl_regs[4]);

    printk("perfmon: %u-bit counters\n", pm_info.pal_perf_mon_info_s.width);
    printk("perfmon: %lu PMC/PMD pairs, %lu PMCs, %lu PMDs\n",
        pmu_conf.max_counters, pmu_conf.num_pmcs, pmu_conf.num_pmds);

    /* sanity check */
    if (pmu_conf.num_pmds >= IA64_NUM_PMD_REGS || pmu_conf.num_pmcs >= IA64_NUM_PMC_REGS) {
        printk(KERN_ERR "perfmon: not enough pmc/pmd, perfmon is DISABLED\n");
        return -1; /* no need to continue anyway */
    }

    if (ia64_pal_debug_info(&pmu_conf.num_ibrs, &pmu_conf.num_dbrs)) {
        printk(KERN_WARNING "perfmon: unable to get number of debug registers\n");
        pmu_conf.num_ibrs = pmu_conf.num_dbrs = 0;
    }
    /* PAL reports the number of pairs */
    pmu_conf.num_ibrs <<= 1;
    pmu_conf.num_dbrs <<= 1;

    /*
     * take a snapshot of all PMU registers. PAL is supposed
     * to configure them with stable/safe values, i.e., not
     * capturing anything.
     * We take a snapshot now, before we make any modifications. This
     * will become our master copy. Then we will reuse the snapshot
     * to reset the PMU in pfm_enable(). Using this technique, perfmon
     * does NOT have to know about the specific values to program for
     * the PMC/PMD. The safe values may be different from one CPU model to
     * the other.
     */
    pfm_pmu_snapshot();

    /*
     * list the pmc registers used to control monitors
     * XXX: unfortunately this information is not provided by PAL
     *
     * We start with the architected minimum and then refine for each CPU model
     */
    pmu_conf.monitor_pmcs[0] = PMM(4)|PMM(5)|PMM(6)|PMM(7);

    /*
     * architected counters
     */
    pmu_conf.counter_pmds[0] |= PMM(4)|PMM(5)|PMM(6)|PMM(7);

#ifdef CONFIG_ITANIUM
    pmu_conf.monitor_pmcs[0] |= PMM(10)|PMM(11)|PMM(12);
    /* Itanium does not add more counters */
#endif

    /* we are all set */
    pmu_conf.pfm_is_disabled = 0;

    /*
     * for now here for debug purposes
     */
    perfmon_dir = create_proc_read_entry("perfmon", 0, 0, perfmon_read_entry, NULL);

    spin_lock_init(&pfm_sessions.pfs_lock);

    return 0;
}

__initcall(perfmon_init);

void
perfmon_init_percpu(void)
{
    ia64_set_pmv(IA64_PERFMON_VECTOR);
    ia64_srlz_d();
}

#else /* !CONFIG_PERFMON */

asmlinkage int
sys_perfmonctl(int pid, int cmd, void *req, int count, long arg5, long arg6,
               long arg7, long arg8, long stack)
{
    return -ENOSYS;
}

#endif /* !CONFIG_PERFMON */
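/*
 * Worked example of the PAL-derived configuration computed in perfmon_init():
 * on a PMU with 47-bit generic counters, perf_ovfl_val = (1UL << 47) - 1 =
 * 0x00007fffffffffff, so each hardware overflow accounts for 2^47 counts in
 * the virtualized 64-bit counters (see pfm_overflow_handler()). The architected
 * mask PMM(4)|PMM(5)|PMM(6)|PMM(7) evaluates to 0xf0, i.e. PMC4-PMC7 control
 * the monitors and PMD4-PMD7 are the counters, matching PMU_FIRST_COUNTER == 4.
 * The 47-bit width is an example value, not something guaranteed by this code.
 */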