[ppc64] shared processor support, from Dave Engebretsen Initial round of code to add shared processor support into 2.6. This adds h_call interfaces, paca/VPA fields, and vpa register. Add adds new idle loop code. --- arch/ppc64/kernel/chrp_setup.c | 2 arch/ppc64/kernel/idle.c | 251 ++++++++++++++++++++++++++++------- arch/ppc64/kernel/pSeries_lpar.c | 25 +++ arch/ppc64/kernel/setup.c | 8 + arch/ppc64/kernel/smp.c | 16 ++ include/asm-ppc64/hvcall.h | 18 ++ include/asm-ppc64/iSeries/ItLpPaca.h | 5 include/asm-ppc64/memory.h | 25 ++- include/asm-ppc64/naca.h | 7 include/asm-ppc64/paca.h | 12 + include/asm-ppc64/processor.h | 1 11 files changed, 306 insertions(+), 64 deletions(-) diff -puN arch/ppc64/kernel/pSeries_lpar.c~ppc64-sharedproc arch/ppc64/kernel/pSeries_lpar.c --- 25/arch/ppc64/kernel/pSeries_lpar.c~ppc64-sharedproc 2004-01-13 23:22:44.000000000 -0800 +++ 25-akpm/arch/ppc64/kernel/pSeries_lpar.c 2004-01-13 23:22:44.000000000 -0800 @@ -37,6 +37,31 @@ #include #include +long poll_pending(void) +{ + unsigned long dummy; + return plpar_hcall(H_POLL_PENDING, 0, 0, 0, 0, + &dummy, &dummy, &dummy); +} + +long prod_processor(void) +{ + plpar_hcall_norets(H_PROD); + return(0); +} + +long cede_processor(void) +{ + plpar_hcall_norets(H_CEDE); + return(0); +} + +long register_vpa(unsigned long flags, unsigned long proc, unsigned long vpa) +{ + plpar_hcall_norets(H_REGISTER_VPA, flags, proc, vpa); + return(0); +} + long plpar_pte_remove(unsigned long flags, unsigned long ptex, unsigned long avpn, diff -puN arch/ppc64/kernel/setup.c~ppc64-sharedproc arch/ppc64/kernel/setup.c --- 25/arch/ppc64/kernel/setup.c~ppc64-sharedproc 2004-01-13 23:22:44.000000000 -0800 +++ 25-akpm/arch/ppc64/kernel/setup.c 2004-01-13 23:22:44.000000000 -0800 @@ -58,6 +58,7 @@ extern void iSeries_init_early( void ); extern void pSeries_init_early( void ); extern void pSeriesLP_init_early(void); extern void mm_init_ppc64( void ); +extern void vpa_init(int cpu); unsigned long decr_overclock = 1; 
unsigned long decr_overclock_proc0 = 1; @@ -211,6 +212,13 @@ void setup_system(unsigned long r3, unsi mm_init_ppc64(); + if (cur_cpu_spec->firmware_features & FW_FEATURE_SPLPAR) { + vpa_init(boot_cpuid); + } + + /* Select the correct idle loop for the platform. */ + idle_setup(); + switch (systemcfg->platform) { #ifdef CONFIG_PPC_ISERIES case PLATFORM_ISERIES_LPAR: diff -puN arch/ppc64/kernel/smp.c~ppc64-sharedproc arch/ppc64/kernel/smp.c --- 25/arch/ppc64/kernel/smp.c~ppc64-sharedproc 2004-01-13 23:22:44.000000000 -0800 +++ 25-akpm/arch/ppc64/kernel/smp.c 2004-01-13 23:22:44.000000000 -0800 @@ -266,6 +266,16 @@ static void __init smp_space_timers(unsi } #ifdef CONFIG_PPC_PSERIES +void vpa_init(int cpu) { + unsigned long flags; + + /* Register the Virtual Processor Area (VPA) */ + printk(KERN_INFO "register_vpa: cpu 0x%x\n", cpu); + flags = 1UL << (63 - 18); + paca[cpu].xLpPaca.xSLBCount = 64; /* SLB restore highwater mark */ + register_vpa(flags, cpu, __pa((unsigned long)&(paca[cpu].xLpPaca))); +} + static void __devinit pSeries_setup_cpu(int cpu) { if (OpenPIC_Addr) { @@ -670,6 +680,12 @@ int __devinit start_secondary(void *unus if (smp_ops->take_timebase) smp_ops->take_timebase(); + get_paca()->yielded = 0; + + if (cur_cpu_spec->firmware_features & FW_FEATURE_SPLPAR) { + vpa_init(cpu); + } + local_irq_enable(); return cpu_idle(NULL); diff -puN include/asm-ppc64/hvcall.h~ppc64-sharedproc include/asm-ppc64/hvcall.h --- 25/include/asm-ppc64/hvcall.h~ppc64-sharedproc 2004-01-13 23:22:44.000000000 -0800 +++ 25-akpm/include/asm-ppc64/hvcall.h 2004-01-13 23:22:44.000000000 -0800 @@ -9,6 +9,14 @@ #define H_PTEG_Full -6 /* PTEG is full */ #define H_Not_Found -7 /* PTE was not found" */ #define H_Reserved_DABR -8 /* DABR address is reserved by the hypervisor on this processor" */ +#define H_NoMem -9 +#define H_Authority -10 +#define H_Permission -11 +#define H_Dropped -12 +#define H_SourceParm -13 +#define H_DestParm -14 +#define H_RemoteParm -15 +#define H_Resource -16 
/* Flags */ #define H_LARGE_PAGE (1UL<<(63-16)) @@ -58,6 +66,16 @@ #define H_IPOLL 0x70 #define H_XIRR 0x74 #define H_PERFMON 0x7c +#define H_MIGRATE_DMA 0x78 +#define H_REGISTER_VPA 0xDC +#define H_CEDE 0xE0 +#define H_CONFER 0xE4 +#define H_PROD 0xE8 +#define H_GET_PPP 0xEC +#define H_SET_PPP 0xF0 +#define H_SET_PURR 0xF4 +#define H_PIC 0xF8 +#define H_POLL_PENDING 0x1D8 /* plpar_hcall() -- Generic call interface using above opcodes * diff -puN include/asm-ppc64/iSeries/ItLpPaca.h~ppc64-sharedproc include/asm-ppc64/iSeries/ItLpPaca.h --- 25/include/asm-ppc64/iSeries/ItLpPaca.h~ppc64-sharedproc 2004-01-13 23:22:44.000000000 -0800 +++ 25-akpm/include/asm-ppc64/iSeries/ItLpPaca.h 2004-01-13 23:22:44.000000000 -0800 @@ -110,7 +110,10 @@ struct ItLpPaca u64 xPDCSavedSPRG1; // Saved SPRG1 for PMC int x68-x6F u64 xPDCSavedSRR0; // Saved SRR0 for PMC int x70-x77 volatile u32 xVirtualDecr; // Virtual DECR for shared procsx78-x7B - u32 xRsvd2_2; // Reserved x7C-x7F + u16 xSLBCount; // # of SLBs to maintain x7C-x7D + u8 xIdle; // Indicate OS is idle x7E + u8 xRsvd2_2; // Reserved x7F + //============================================================================= // CACHE_LINE_3 0x0100 - 0x007F: This line is shared with other processors diff -puN include/asm-ppc64/paca.h~ppc64-sharedproc include/asm-ppc64/paca.h --- 25/include/asm-ppc64/paca.h~ppc64-sharedproc 2004-01-13 23:22:44.000000000 -0800 +++ 25-akpm/include/asm-ppc64/paca.h 2004-01-13 23:22:44.000000000 -0800 @@ -94,7 +94,9 @@ struct paca_struct { u32 *prof_buffer; /* iSeries profiling buffer 0x38 */ u32 *prof_stext; /* iSeries start of kernel text 0x40 */ u32 prof_len; /* iSeries length of profile buffer -1 0x48 */ - u8 rsvd2[128-76]; /* 0x4C */ + u8 yielded; /* 0 = this processor is running 0x4c */ + /* 1 = this processor is yielded */ + u8 rsvd2[128-77]; /* 0x4D */ /*===================================================================================== * CACHE_LINE_3 0x0100 - 0x017F @@ -117,7 +119,7 @@ struct 
paca_struct { struct ItLpRegSave xRegSav; /* Register save for proc */ /*===================================================================================== - * CACHE_LINE_17-18 0x0800 - 0x0EFF Reserved + * CACHE_LINE_17-18 0x0800 - 0x08FF Reserved *===================================================================================== */ struct rtas_args xRtas; /* Per processor RTAS struct */ @@ -126,10 +128,12 @@ struct paca_struct { u8 rsvd5[256-16-sizeof(struct rtas_args)]; /*===================================================================================== - * CACHE_LINE_19-30 0x0800 - 0x0EFF Reserved + * CACHE_LINE_19-30 0x0900 - 0x0EFF Reserved *===================================================================================== */ - u8 rsvd6[0x600]; + u64 slb_shadow[0x20]; + u64 dispatch_log; + u8 rsvd6[0x500 - 0x8]; /*===================================================================================== * CACHE_LINE_31 0x0F00 - 0x0F7F Exception stack diff -puN include/asm-ppc64/memory.h~ppc64-sharedproc include/asm-ppc64/memory.h --- 25/include/asm-ppc64/memory.h~ppc64-sharedproc 2004-01-13 23:22:44.000000000 -0800 +++ 25-akpm/include/asm-ppc64/memory.h 2004-01-13 23:22:44.000000000 -0800 @@ -42,23 +42,28 @@ static inline void isync(void) #endif /* Macros for adjusting thread priority (hardware multi-threading) */ - -#if defined(CONFIG_PPC_ISERIES) || defined(CONFIG_HMT) +#define HMT_very_low() asm volatile("or 31,31,31 # very low priority") #define HMT_low() asm volatile("or 1,1,1 # low priority") +#define HMT_medium_low() asm volatile("or 6,6,6 # medium low priority") #define HMT_medium() asm volatile("or 2,2,2 # medium priority") +#define HMT_medium_high() asm volatile("or 5,5,5 # medium high priority") #define HMT_high() asm volatile("or 3,3,3 # high priority") +#define HMT_VERY_LOW "\tor 31,31,31 # very low priority\n" #define HMT_LOW "\tor 1,1,1 # low priority\n" +#define HMT_MEDIUM_LOW "\tor 6,6,6 # medium low priority\n" #define HMT_MEDIUM "\tor 
2,2,2 # medium priority\n" +#define HMT_MEDIUM_HIGH "\tor 5,5,5 # medium high priority\n" #define HMT_HIGH "\tor 3,3,3 # high priority\n" -#else -#define HMT_low() do { } while(0) -#define HMT_medium() do { } while(0) -#define HMT_high() do { } while(0) -#define HMT_LOW -#define HMT_MEDIUM -#define HMT_HIGH -#endif +/* + * Various operational modes for SMT + * Off : never run threaded + * On : always run threaded + * Dynamic: Allow the system to switch modes as needed + */ +#define SMT_OFF 0 +#define SMT_ON 1 +#define SMT_DYNAMIC 2 #endif diff -puN include/asm-ppc64/naca.h~ppc64-sharedproc include/asm-ppc64/naca.h --- 25/include/asm-ppc64/naca.h~ppc64-sharedproc 2004-01-13 23:22:44.000000000 -0800 +++ 25-akpm/include/asm-ppc64/naca.h 2004-01-13 23:22:44.000000000 -0800 @@ -37,7 +37,12 @@ struct naca_struct { u32 dCacheL1LinesPerPage; /* L1 d-cache lines / page 0x64 */ u32 iCacheL1LogLineSize; /* L1 i-cache line size Log2 0x68 */ u32 iCacheL1LinesPerPage; /* L1 i-cache lines / page 0x6c */ - u64 resv0[2]; /* Reserved 0x70 - 0x7F */ + u64 smt_snooze_delay; /* Delay (in usec) before 0x70 */ + /* entering ST mode */ + u8 smt_state; /* 0 = SMT off 0x78 */ + /* 1 = SMT on */ + /* 2 = SMT dynamic */ + u8 resv0[7]; /* Reserved 0x79 - 0x7F */ }; extern struct naca_struct *naca; diff -puN include/asm-ppc64/processor.h~ppc64-sharedproc include/asm-ppc64/processor.h --- 25/include/asm-ppc64/processor.h~ppc64-sharedproc 2004-01-13 23:22:44.000000000 -0800 +++ 25-akpm/include/asm-ppc64/processor.h 2004-01-13 23:22:44.000000000 -0800 @@ -378,6 +378,7 @@ #define PLATFORM_PSERIES 0x0100 #define PLATFORM_PSERIES_LPAR 0x0101 #define PLATFORM_ISERIES_LPAR 0x0201 +#define PLATFORM_LPAR 0x0001 /* * List of interrupt controllers. 
diff -puN arch/ppc64/kernel/chrp_setup.c~ppc64-sharedproc arch/ppc64/kernel/chrp_setup.c --- 25/arch/ppc64/kernel/chrp_setup.c~ppc64-sharedproc 2004-01-13 23:22:44.000000000 -0800 +++ 25-akpm/arch/ppc64/kernel/chrp_setup.c 2004-01-13 23:22:44.000000000 -0800 @@ -273,7 +273,7 @@ chrp_init(unsigned long r3, unsigned lon ppc_md.progress = chrp_progress; - /* build up the firmware_features bitmask field + /* Build up the firmware_features bitmask field * using contents of device-tree/ibm,hypertas-functions. * Ultimately this functionality may be moved into prom.c prom_init(). */ diff -puN arch/ppc64/kernel/idle.c~ppc64-sharedproc arch/ppc64/kernel/idle.c --- 25/arch/ppc64/kernel/idle.c~ppc64-sharedproc 2004-01-13 23:22:44.000000000 -0800 +++ 25-akpm/arch/ppc64/kernel/idle.c 2004-01-13 23:22:44.000000000 -0800 @@ -1,5 +1,13 @@ /* - * idle.c + * Idle daemon for PowerPC. Idle daemon will handle any action + * that needs to be taken when the system becomes idle. + * + * Originally Written by Cort Dougan (cort@cs.nmt.edu) + * + * iSeries supported added by Mike Corrigan + * + * Additional shared processor, SMT, and firmware support + * Copyright (c) 2003 Dave Engebretsen * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -26,99 +34,104 @@ #include #include #include +#include #include - -#ifdef CONFIG_PPC_ISERIES - #include #include #include -unsigned long maxYieldTime = 0; -unsigned long minYieldTime = 0xffffffffffffffffUL; +extern long cede_processor(void); +extern long poll_pending(void); + +int (*idle_loop)(void); +#ifdef CONFIG_PPC_ISERIES static void yield_shared_processor(void) { - unsigned long tb; - unsigned long yieldTime; + struct paca_struct *lpaca = get_paca(); HvCall_setEnabledInterrupts(HvCall_MaskIPI | HvCall_MaskLpEvent | HvCall_MaskLpProd | HvCall_MaskTimeout); - tb = get_tb(); - /* Compute future tb value when yield should expire */ - HvCall_yieldProcessor(HvCall_YieldTimed, 
tb+tb_ticks_per_jiffy); - - yieldTime = get_tb() - tb; - if (yieldTime > maxYieldTime) - maxYieldTime = yieldTime; + if (!ItLpQueue_isLpIntPending(lpaca->lpQueuePtr)) { + /* + * Compute future tb value when yield should expire. + * We want to be woken up when the next decrementer is + * to fire. + */ + + local_irq_disable(); + lpaca->yielded = 1; /* Indicate a prod is desired */ + lpaca->xLpPaca.xIdle = 1; /* Inform the HV we are idle */ + + HvCall_yieldProcessor(HvCall_YieldTimed, + lpaca->next_jiffy_update_tb); - if (yieldTime < minYieldTime) - minYieldTime = yieldTime; + lpaca->yielded = 0; /* Back to IPI's */ + local_irq_enable(); - /* - * The decrementer stops during the yield. Force a fake decrementer - * here and let the timer_interrupt code sort out the actual time. - */ - get_paca()->xLpPaca.xIntDword.xFields.xDecrInt = 1; + /* + * The decrementer stops during the yield. Force a fake + * decrementer here and let the timer_interrupt code sort + * out the actual time. + */ + lpaca->xLpPaca.xIntDword.xFields.xDecrInt = 1; + } + process_iSeries_events(); } -int cpu_idle(void) +int iSeries_idle(void) { struct paca_struct *lpaca; long oldval; unsigned long CTRL; -#warning fix iseries run light -#if 0 + /* endless loop with no priority at all */ + current->nice = 20; + current->counter = -100; + /* ensure iSeries run light will be out when idle */ current->thread.flags &= ~PPC_FLAG_RUN_LIGHT; CTRL = mfspr(CTRLF); CTRL &= ~RUNLATCH; mtspr(CTRLT, CTRL); -#endif + init_idle(); lpaca = get_paca(); - while (1) { + for (;;) { if (lpaca->xLpPaca.xSharedProc) { if (ItLpQueue_isLpIntPending(lpaca->lpQueuePtr)) process_iSeries_events(); - if (!need_resched()) + if (!current->need_resched) yield_shared_processor(); } else { - oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED); - + /* Avoid an IPI by setting need_resched */ + oldval = xchg(&current->need_resched, -1); if (!oldval) { - set_thread_flag(TIF_POLLING_NRFLAG); - - while (!need_resched()) { + 
while(current->need_resched == -1) { HMT_medium(); if (ItLpQueue_isLpIntPending(lpaca->lpQueuePtr)) process_iSeries_events(); HMT_low(); } - - HMT_medium(); - clear_thread_flag(TIF_POLLING_NRFLAG); - } else { - set_need_resched(); } } - - if (need_resched()) + HMT_medium(); + if (current->need_resched) { + lpaca->xLpPaca.xIdle = 0; schedule(); + check_pgt_cache(); + } } - return 0; } +#endif -#else /* CONFIG_PPC_ISERIES */ - -int cpu_idle(void) +int default_idle(void) { long oldval; @@ -145,9 +158,153 @@ int cpu_idle(void) return 0; } -#endif /* CONFIG_PPC_ISERIES */ +int dedicated_idle(void) +{ + long oldval; + struct paca_struct *lpaca = get_paca(), *ppaca;; + unsigned long start_snooze; + + ppaca = &paca[(lpaca->xPacaIndex) ^ 1]; + + while (1) { + /* Indicate to the HV that we are idle. Now would be + * a good time to find other work to dispatch. */ + lpaca->xLpPaca.xIdle = 1; + + oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED); + if (!oldval) { + set_thread_flag(TIF_POLLING_NRFLAG); + start_snooze = __get_tb(); + while (!need_resched()) { + /* need_resched could be 1 or 0 at this + * point. If it is 0, set it to 0, so + * an IPI/Prod is sent. If it is 1, keep + * it that way & schedule work. + */ + if (__get_tb() < + (start_snooze + + naca->smt_snooze_delay*tb_ticks_per_usec)) { + HMT_low(); /* Low thread priority */ + continue; + } + + HMT_very_low(); /* Low power mode */ -void default_idle(void) + /* If the SMT mode is system controlled & the + * partner thread is doing work, switch into + * ST mode. + */ + if((naca->smt_state == SMT_DYNAMIC) && + (!(ppaca->xLpPaca.xIdle))) { + /* Indicate we are no longer polling for + * work, and then clear need_resched. If + * need_resched was 1, set it back to 1 + * and schedule work + */ + clear_thread_flag(TIF_POLLING_NRFLAG); + oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED); + if(oldval == 1) { + set_need_resched(); + break; + } + + /* DRENG: Go HMT_medium here ? 
*/ + local_irq_disable(); + lpaca->yielded = 1; + + /* SMT dynamic mode. Cede will result + * in this thread going dormant, if the + * partner thread is still doing work. + * Thread wakes up if partner goes idle, + * an interrupt is presented, or a prod + * occurs. Returning from the cede + * enables external interrupts. + */ + cede_processor(); + + lpaca->yielded = 0; + } else { + /* Give the HV an opportunity at the + * processor, since we are not doing + * any work. + */ + poll_pending(); + } + } + } else { + set_need_resched(); + } + + HMT_medium(); + lpaca->xLpPaca.xIdle = 0; + schedule(); + } + return 0; +} + +int shared_idle(void) { - barrier(); + struct paca_struct *lpaca = get_paca(); + + while (1) { + /* Indicate to the HV that we are idle. Now would be + * a good time to find other work to dispatch. */ + lpaca->xLpPaca.xIdle = 1; + + if (!need_resched()) { + local_irq_disable(); + lpaca->yielded = 1; + + /* + * Yield the processor to the hypervisor. We return if + * an external interrupt occurs (which are driven prior + * to returning here) or if a prod occurs from another + * processor. When returning here, external interrupts + * are enabled. + */ + cede_processor(); + + lpaca->yielded = 0; + } + + HMT_medium(); + lpaca->xLpPaca.xIdle = 0; + schedule(); + } + + return 0; +} + +int cpu_idle(void) +{ + idle_loop(); + return 0; } + +int idle_setup(void) +{ +#ifdef CONFIG_PPC_ISERIES + idle_loop = iSeries_idle; +#else + if (systemcfg->platform & PLATFORM_PSERIES) { + if (cur_cpu_spec->firmware_features & FW_FEATURE_SPLPAR) { + if(get_paca()->xLpPaca.xSharedProc) { + printk("idle = shared_idle\n"); + idle_loop = shared_idle; + } else { + printk("idle = dedicated_idle\n"); + idle_loop = dedicated_idle; + } + } else { + printk("idle = default_idle\n"); + idle_loop = default_idle; + } + } else { + printk("idle_setup: unknown platform, use default_idle\n"); + idle_loop = default_idle; + } +#endif + + return 1; +} + _