1 /* smp.c: Sparc64 SMP support. 3 * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu) 6 #include <linux/kernel.h> 7 #include <linux/sched.h> 9 #include <linux/pagemap.h> 10 #include <linux/threads.h> 11 #include <linux/smp.h> 12 #include <linux/smp_lock.h> 13 #include <linux/interrupt.h> 14 #include <linux/kernel_stat.h> 15 #include <linux/delay.h> 16 #include <linux/init.h> 17 #include <linux/spinlock.h> 20 #include <asm/ptrace.h> 21 #include <asm/atomic.h> 25 #include <asm/pgtable.h> 26 #include <asm/oplib.h> 27 #include <asm/hardirq.h> 28 #include <asm/softirq.h> 29 #include <asm/uaccess.h> 30 #include <asm/timer.h> 32 #define __KERNEL_SYSCALLS__ 33 #include <linux/unistd.h> 35 externint linux_num_cpus
; 36 externvoidcalibrate_delay(void); 37 externunsigned prom_cpu_nodes
[]; 39 struct cpuinfo_sparc cpu_data
[NR_CPUS
]__attribute__((aligned(64))); 41 volatileint cpu_number_map
[NR_CPUS
]__attribute__((aligned(64))); 42 volatileint __cpu_logical_map
[NR_CPUS
]__attribute__((aligned(64))); 44 /* Please don't make this stuff initdata!!! --DaveM */ 45 static unsigned char boot_cpu_id
=0; 46 static int smp_activated
=0; 49 spinlock_t kernel_flag
= SPIN_LOCK_UNLOCKED
; 51 volatileint smp_processors_ready
=0; 52 unsigned long cpu_present_map
=0; 54 int smp_threads_ready
=0; 56 void __init
smp_setup(char*str
,int*ints
) 58 /* XXX implement me XXX */ 65 strcpy(buf
,"State:\n"); 66 for(i
=0; i
< NR_CPUS
; i
++) 67 if(cpu_present_map
& (1UL<< i
)) 68 len
+=sprintf(buf
+ len
, 69 "CPU%d:\t\tonline\n", i
); 77 for(i
=0; i
< NR_CPUS
; i
++) 78 if(cpu_present_map
& (1UL<< i
)) 79 len
+=sprintf(buf
+ len
, 80 "Cpu%dBogo\t: %lu.%02lu\n", 81 i
, cpu_data
[i
].udelay_val
/500000, 82 (cpu_data
[i
].udelay_val
/5000) %100); 86 void __init
smp_store_cpu_info(int id
) 90 cpu_data
[id
].irq_count
=0; 91 cpu_data
[id
].bh_count
=0; 92 /* multiplier and counter set by 93 smp_setup_percpu_timer() */ 94 cpu_data
[id
].udelay_val
= loops_per_sec
; 96 cpu_data
[id
].pgcache_size
=0; 97 cpu_data
[id
].pte_cache
= NULL
; 98 cpu_data
[id
].pgdcache_size
=0; 99 cpu_data
[id
].pgd_cache
= NULL
; 100 cpu_data
[id
].idle_volume
=1; 102 for(i
=0; i
<16; i
++) 103 cpu_data
[id
].irq_worklists
[i
] =0; 106 void __init
smp_commence(void) 110 static voidsmp_setup_percpu_timer(void); 111 static voidsmp_tune_scheduling(void); 113 staticvolatileunsigned long callin_flag
=0; 115 externvoidinherit_locked_prom_mappings(int save_p
); 116 externvoidcpu_probe(void); 118 void __init
smp_callin(void) 120 int cpuid
=hard_smp_processor_id(); 122 inherit_locked_prom_mappings(0); 129 /* Master did this already, now is the time for us to do it. */ 130 __asm__
__volatile__(" 131 sethi %%hi(0x80000000), %%g1 135 andn %%g2, %%g1, %%g2 141 smp_setup_percpu_timer(); 146 smp_store_cpu_info(cpuid
); 148 __asm__
__volatile__("membar #Sync\n\t" 149 "flush %%g6": : :"memory"); 151 /* Clear this or we will die instantly when we 152 * schedule back to this idler... 154 current
->thread
.flags
&= ~(SPARC_FLAG_NEWCHILD
); 156 /* Attach to the address space of init_task. */ 157 atomic_inc(&init_mm
.mm_count
); 158 current
->active_mm
= &init_mm
; 160 while(!smp_processors_ready
extern int cpu_idle(void);
extern void init_IRQ(void);

/* Nothing extra to do before a secondary enters its idle loop. */
void initialize_secondary(void)
{
}

/* First C code run by a secondary cpu's initial kernel thread. */
int start_secondary(void *unused)
{
	/* NOTE(review): original lines 172-180 were lost in extraction;
	 * historically this initialized traps/IRQs, called smp_callin()
	 * and then entered cpu_idle() — verify against the original. */
	init_IRQ();
	smp_callin();
	cpu_idle();

	/* cpu_idle() must never return; reaching here is fatal. */
	printk("CPU[%d]: Returns from cpu_idle!\n", smp_processor_id());
	panic("SMP bolixed\n");
}
[64]; 187 externunsigned long smp_trampoline
; 189 /* The OBP cpu startup callback truncates the 3rd arg cookie to 190 * 32-bits (I think) so to be safe we have it read the pointer 191 * contained here so we work on >4GB machines. -DaveM 193 static struct task_struct
*cpu_new_task
= NULL
; 195 void __init
smp_boot_cpus(void) 199 printk("Entering UltraSMPenguin Mode...\n"); 201 smp_store_cpu_info(boot_cpu_id
); 202 smp_tune_scheduling(); 205 if(linux_num_cpus
==1) 208 for(i
=0; i
< NR_CPUS
; i
++) { 212 if(cpu_present_map
& (1UL<< i
)) { 213 unsigned long entry
= (unsigned long)(&smp_trampoline
); 214 unsigned long cookie
= (unsigned long)(&cpu_new_task
); 215 struct task_struct
*p
; 218 externunsigned long phys_base
; 220 entry
+= phys_base
- KERNBASE
; 221 cookie
+= phys_base
- KERNBASE
; 222 kernel_thread(start_secondary
, NULL
, CLONE_PID
); 225 p
= init_task
.prev_task
; 226 init_tasks
[cpucount
] = p
; 229 p
->has_cpu
=1;/* we schedule the first task manually */ 231 del_from_runqueue(p
); 235 for(no
=0; no
< linux_num_cpus
; no
++) 236 if(linux_cpus
[no
].mid
== i
) 239 prom_startcpu(linux_cpus
[no
].prom_node
, 241 for(timeout
=0; timeout
<5000000; timeout
++) { 247 cpu_number_map
[i
] = cpucount
; 248 __cpu_logical_map
[cpucount
] = i
; 249 prom_cpu_nodes
[i
] = linux_cpus
[no
].prom_node
; 252 printk("Processor %d is stuck.\n", i
); 256 cpu_present_map
&= ~(1UL<< i
); 257 cpu_number_map
[i
] = -1; 262 printk("Error: only one processor found.\n"); 263 cpu_present_map
= (1UL<<smp_processor_id()); 265 unsigned long bogosum
=0; 267 for(i
=0; i
< NR_CPUS
; i
++) { 268 if(cpu_present_map
& (1UL<< i
)) 269 bogosum
+= cpu_data
[i
].udelay_val
; 271 printk("Total of %d processors activated (%lu.%02lu BogoMIPS).\n", 273 (bogosum
+2500)/500000, 274 ((bogosum
+2500)/5000)%100); 276 smp_num_cpus
= cpucount
+1; 278 smp_processors_ready
=1; 279 membar("#StoreStore | #StoreLoad"); 282 /* #define XCALL_DEBUG */ 284 staticinlinevoidxcall_deliver(u64 data0
, u64 data1
, u64 data2
, u64 pstate
,unsigned long cpu
) 286 u64 result
, target
= (cpu
<<14) |0x70; 290 printk("CPU[%d]: xcall(data[%016lx:%016lx:%016lx],tgt[%016lx])\n", 291 smp_processor_id(), data0
, data1
, data2
, target
); 295 __asm__
__volatile__(" 296 wrpr %1, %2, %%pstate 305 :"r"(pstate
),"i"(PSTATE_IE
),"i"(ASI_UDB_INTR_W
), 306 "r"(data0
),"r"(data1
),"r"(data2
),"r"(target
),"r"(0x10),"0"(tmp
)); 308 /* NOTE: PSTATE_IE is still clear. */ 311 __asm__
__volatile__("ldxa [%%g0] %1, %0" 313 :"i"(ASI_INTR_DISPATCH_STAT
)); 315 __asm__
__volatile__("wrpr %0, 0x0, %%pstate" 323 __asm__
__volatile__("wrpr %0, 0x0, %%pstate" 327 printk("CPU[%d]: mondo stuckage result[%016lx]\n", 328 smp_processor_id(), result
); 332 printk("CPU[%d]: Penguin %d NACK's master.\n",smp_processor_id(), cpu
); 339 voidsmp_cross_call(unsigned long*func
, u32 ctx
, u64 data1
, u64 data2
) 341 if(smp_processors_ready
) { 342 unsigned long mask
= (cpu_present_map
& ~(1UL<<smp_processor_id())); 343 u64 pstate
, data0
= (((u64
)ctx
)<<32| (((u64
)func
) &0xffffffff)); 344 int i
, ncpus
= smp_num_cpus
-1; 346 __asm__
__volatile__("rdpr %%pstate, %0":"=r"(pstate
)); 347 for(i
=0; i
< NR_CPUS
; i
++) { 348 if(mask
& (1UL<< i
)) { 349 xcall_deliver(data0
, data1
, data2
, pstate
, i
); 354 /* NOTE: Caller runs local copy on master. */ 358 externunsigned long xcall_flush_tlb_page
/* Cross-call trampolines, implemented in assembly elsewhere. */
extern unsigned long xcall_flush_tlb_page;
extern unsigned long xcall_flush_tlb_mm;
extern unsigned long xcall_flush_tlb_range;
extern unsigned long xcall_flush_tlb_all;
extern unsigned long xcall_tlbcachesync;
extern unsigned long xcall_flush_cache_all;
extern unsigned long xcall_report_regs;
extern unsigned long xcall_receive_signal;
) 369 if(smp_processors_ready
&& 370 (cpu_present_map
& (1UL<<cpu
)) !=0) { 371 u64 pstate
, data0
= (((u64
)&xcall_receive_signal
) &0xffffffff); 372 __asm__
__volatile__("rdpr %%pstate, %0":"=r"(pstate
)); 373 xcall_deliver(data0
,0,0, pstate
, cpu
); 377 voidsmp_report_regs(void) 379 smp_cross_call(&xcall_report_regs
,0,0,0); 382 voidsmp_flush_cache_all(void) 384 smp_cross_call(&xcall_flush_cache_all
,0,0,0); 388 voidsmp_flush_tlb_all(void) 390 smp_cross_call(&xcall_flush_tlb_all
,0,0,0); 394 /* We know that the window frames of the user have been flushed 395 * to the stack before we get here because all callers of us 396 * are flush_tlb_*() routines, and these run after flush_cache_*() 397 * which performs the flushw. 399 * XXX I diked out the fancy flush avoidance code for the 400 * XXX swapping cases for now until the new MM code stabilizes. -DaveM 402 * The SMP TLB coherency scheme we use works as follows: 404 * 1) mm->cpu_vm_mask is a bit mask of which cpus an address 405 * space has (potentially) executed on, this is the heuristic 406 * we use to avoid doing cross calls. 408 * 2) TLB context numbers are shared globally across all processors 409 * in the system, this allows us to play several games to avoid 412 * One invariant is that when a cpu switches to a process, and 413 * that processes tsk->active_mm->cpu_vm_mask does not have the 414 * current cpu's bit set, that tlb context is flushed locally. 416 * If the address space is non-shared (ie. mm->count == 1) we avoid 417 * cross calls when we want to flush the currently running process's 418 * tlb state. This is done by clearing all cpu bits except the current 419 * processor's in current->active_mm->cpu_vm_mask and performing the 420 * flush locally only. This will force any subsequent cpus which run 421 * this task to flush the context from the local tlb if the process 422 * migrates to another cpu (again). 424 * 3) For shared address spaces (threads) and swapping we bite the 425 * bullet for most cases and perform the cross call. 427 * The performance gain from "optimizing" away the cross call for threads is 428 * questionable (in theory the big win for threads is the massive sharing of 429 * address space state across processors). 431 * For the swapping case the locking is difficult to get right, we'd have to 432 * enforce strict ordered access to mm->cpu_vm_mask via a spinlock for example. 
433 * Then again one could argue that when you are swapping, the cost of a cross 434 * call won't even show up on the performance radar. But in any case we do get 435 * rid of the cross-call when the task has a dead context or the task has only 436 * ever run on the local cpu. 438 voidsmp_flush_tlb_mm(struct mm_struct
*mm
) 440 u32 ctx
=CTX_HWBITS(mm
->context
); 442 if(mm
== current
->active_mm
&& 443 atomic_read(&mm
->mm_users
) ==1&& 444 (mm
->cpu_vm_mask
== (1UL<<smp_processor_id()))) 445 goto local_flush_and_out
; 447 smp_cross_call(&xcall_flush_tlb_mm
, ctx
,0,0); 450 __flush_tlb_mm(ctx
, SECONDARY_CONTEXT
); 453 voidsmp_flush_tlb_range(struct mm_struct
*mm
,unsigned long start
, 456 u32 ctx
=CTX_HWBITS(mm
->context
); 460 if(mm
== current
->active_mm
&& 461 atomic_read(&mm
->mm_users
) ==1&& 462 (mm
->cpu_vm_mask
== (1UL<<smp_processor_id()))) 463 goto local_flush_and_out
; 465 smp_cross_call(&xcall_flush_tlb_range
, ctx
, start
, end
); 468 __flush_tlb_range(ctx
, start
, SECONDARY_CONTEXT
, end
, PAGE_SIZE
, (end
-start
)); 471 voidsmp_flush_tlb_page(struct mm_struct
*mm
,unsigned long page
) 473 u32 ctx
=CTX_HWBITS(mm
->context
); 476 if(mm
== current
->active_mm
&& 477 atomic_read(&mm
->mm_users
) ==1&& 478 (mm
->cpu_vm_mask
== (1UL<<smp_processor_id()))) { 479 goto local_flush_and_out
; 482 smp_cross_call(&xcall_flush_tlb_page
, ctx
, page
,0); 485 __flush_tlb_page(ctx
, page
, SECONDARY_CONTEXT
); 489 /* #define CAPTURE_DEBUG */ 490 externunsigned long xcall_capture
; 492 static atomic_t smp_capture_depth
=ATOMIC_INIT(0); 493 static atomic_t smp_capture_registry
=ATOMIC_INIT(0); 494 static unsigned long penguins_are_doing_time
=0; 496 voidsmp_capture(void) 498 if(smp_processors_ready
) { 499 int result
=atomic_add_return(1, &smp_capture_depth
); 501 membar("#StoreStore | #LoadStore"); 503 int ncpus
= smp_num_cpus
; 506 printk("CPU[%d]: Sending penguins to jail...", 509 penguins_are_doing_time
=1; 510 membar("#StoreStore | #LoadStore"); 511 atomic_inc(&smp_capture_registry
); 512 smp_cross_call(&xcall_capture
,0,0,0); 513 while(atomic_read(&smp_capture_registry
) != ncpus
) 522 voidsmp_release(void) 524 if(smp_processors_ready
) { 525 if(atomic_dec_and_test(&smp_capture_depth
)) { 527 printk("CPU[%d]: Giving pardon to imprisoned penguins\n", 530 penguins_are_doing_time
=0; 531 membar("#StoreStore | #StoreLoad"); 532 atomic_dec(&smp_capture_registry
); 537 /* Imprisoned penguins run with %pil == 15, but PSTATE_IE set, so they 538 * can service tlb flush xcalls... 540 voidsmp_penguin_jailcell(void) 543 atomic_inc(&smp_capture_registry
); 544 membar("#StoreLoad | #StoreStore"); 545 while(penguins_are_doing_time
) 547 atomic_dec(&smp_capture_registry
); 550 staticinlinevoidsparc64_do_profile(unsigned long pc
,unsigned long g3
) 552 if(prof_buffer
&& current
->pid
) { 554 externint rwlock_impl_begin
, rwlock_impl_end
; 555 externint atomic_impl_begin
, atomic_impl_end
; 557 if((pc
>= (unsigned long) &rwlock_impl_begin
&& 558 pc
< (unsigned long) &rwlock_impl_end
) || 559 (pc
>= (unsigned long) &atomic_impl_begin
&& 560 pc
< (unsigned long) &atomic_impl_end
)) 563 pc
-= (unsigned long) &_stext
; 568 atomic_inc((atomic_t
*)&prof_buffer
[pc
]); 572 static unsigned long current_tick_offset
/* Delta added to %tick_cmpr each timer tick (shrunk when profiling at
 * a multiplied rate — see setup_profiling_timer). */
static unsigned long current_tick_offset;

#define prof_multiplier(__cpu)	cpu_data[(__cpu)].multiplier
#define prof_counter(__cpu)	cpu_data[(__cpu)].counter

extern void update_one_process(struct task_struct *p, unsigned long ticks,
			       unsigned long user, unsigned long system,
			       int cpu);
*regs
) 583 unsigned long compare
, tick
; 584 int cpu
=smp_processor_id(); 585 int user
=user_mode(regs
); 588 * Check for level 14 softint. 590 if(!(get_softint() & (1UL<<0))) { 591 externvoidhandler_irq(int,struct pt_regs
*); 593 handler_irq(14, regs
); 597 clear_softint((1UL<<0)); 600 sparc64_do_profile(regs
->tpc
, regs
->u_regs
[UREG_G3
]); 601 if(!--prof_counter(cpu
)) 603 if(cpu
== boot_cpu_id
) { 604 /* XXX Keep this in sync with irq.c --DaveM */ 605 #define irq_enter(cpu, irq) \ 606 do { hardirq_enter(cpu); \ 607 spin_unlock_wait(&global_irq_lock); \ 609 #define irq_exit(cpu, irq) hardirq_exit(cpu) 612 kstat
.irqs
[cpu
][0]++; 614 timer_tick_interrupt(regs
); 623 unsigned int*inc
, *inc2
; 625 update_one_process(current
,1, user
, !user
, cpu
); 626 if(--current
->counter
<=0) { 628 current
->need_resched
=1; 632 if(current
->priority
< DEF_PRIORITY
) { 633 inc
= &kstat
.cpu_nice
; 634 inc2
= &kstat
.per_cpu_nice
[cpu
]; 636 inc
= &kstat
.cpu_user
; 637 inc2
= &kstat
.per_cpu_user
[cpu
]; 640 inc
= &kstat
.cpu_system
; 641 inc2
= &kstat
.per_cpu_system
[cpu
]; 643 atomic_inc((atomic_t
*)inc
); 644 atomic_inc((atomic_t
*)inc2
); 646 prof_counter(cpu
) =prof_multiplier(cpu
); 649 __asm__
__volatile__("rd %%tick_cmpr, %0\n\t" 651 "wr %0, 0x0, %%tick_cmpr\n\t" 653 :"=&r"(compare
),"=r"(tick
) 654 :"r"(current_tick_offset
)); 655 }while(tick
>= compare
); 658 static void __init
smp_setup_percpu_timer(void) 660 int cpu
=smp_processor_id(); 662 prof_counter(cpu
) =prof_multiplier(cpu
) =1; 664 __asm__
__volatile__("rd %%tick, %%g1\n\t" 665 "add %%g1, %0, %%g1\n\t" 666 "wr %%g1, 0x0, %%tick_cmpr" 668 :"r"(current_tick_offset
) 672 void __init
smp_tick_init(void) 676 boot_cpu_id
=hard_smp_processor_id(); 677 current_tick_offset
= timer_tick_offset
; 679 for(i
=0; i
< linux_num_cpus
; i
++) 680 cpu_present_map
|= (1UL<< linux_cpus
[i
].mid
); 681 for(i
=0; i
< NR_CPUS
; i
++) { 682 cpu_number_map
[i
] = -1; 683 __cpu_logical_map
[i
] = -1; 685 cpu_number_map
[boot_cpu_id
] =0; 686 prom_cpu_nodes
[boot_cpu_id
] = linux_cpus
[0].prom_node
; 687 __cpu_logical_map
[0] = boot_cpu_id
; 688 current
->processor
= boot_cpu_id
; 689 prof_counter(boot_cpu_id
) =prof_multiplier(boot_cpu_id
) =1; 692 staticinlineunsigned longfind_flush_base(unsigned long size
) 694 struct page
*p
= mem_map
; 695 unsigned long found
, base
; 697 size
=PAGE_ALIGN(size
); 699 base
=page_address(p
); 702 if(p
>= (mem_map
+ max_mapnr
)) 706 base
=page_address(p
); 716 cycles_t cacheflush_time
; 718 static void __init
smp_tune_scheduling(void) 720 unsigned long flush_base
, flags
, *p
; 721 unsigned int ecache_size
; 722 cycles_t tick1
, tick2
, raw
; 724 /* Approximate heuristic for SMP scheduling. It is an 725 * estimation of the time it takes to flush the L2 cache 726 * on the local processor. 728 * The ia32 chooses to use the L1 cache flush time instead, 729 * and I consider this complete nonsense. The Ultra can service 730 * a miss to the L1 with a hit to the L2 in 7 or 8 cycles, and 731 * L2 misses are what create extra bus traffic (ie. the "cost" 732 * of moving a process from one cpu to another). 734 printk("SMP: Calibrating ecache flush... "); 735 ecache_size
=prom_getintdefault(linux_cpus
[0].prom_node
, 736 "ecache-size", (512*1024)); 737 flush_base
=find_flush_base(ecache_size
<<1); 739 if(flush_base
!=0UL) { 740 __save_and_cli(flags
); 742 /* Scan twice the size once just to get the TLB entries 743 * loaded and make sure the second scan measures pure misses. 745 for(p
= (unsigned long*)flush_base
; 746 ((unsigned long)p
) < (flush_base
+ (ecache_size
<<1)); 747 p
+= (64/sizeof(unsigned long))) 748 *((volatileunsigned long*)p
); 750 /* Now the real measurement. */ 751 __asm__
__volatile__(" 756 1: ldx [%2 + 0x000], %%g1 757 ldx [%2 + 0x040], %%g2 758 ldx [%2 + 0x080], %%g3 759 ldx [%2 + 0x0c0], %%g5 766 :"=&r"(tick1
),"=&r"(tick2
),"=&r"(flush_base
) 767 :"2"(flush_base
),"r"(flush_base
+ ecache_size
) 768 :"g1","g2","g3","g5"); 770 __restore_flags(flags
); 772 raw
= (tick2
- tick1
); 774 /* Dampen it a little, considering two processes 775 * sharing the cache and fitting. 777 cacheflush_time
= (raw
- (raw
>>2)); 779 cacheflush_time
= ((ecache_size
<<2) + 782 printk("Using heuristic of %d cycles.\n", 783 (int) cacheflush_time
); 786 /* /proc/profile writes can call this, don't __init it please. */ 787 intsetup_profiling_timer(unsigned int multiplier
) 792 if((!multiplier
) || (timer_tick_offset
/ multiplier
) <1000) 796 for(i
=0; i
< NR_CPUS
; i
++) { 797 if(cpu_present_map
& (1UL<< i
)) 798 prof_multiplier(i
) = multiplier
; 800 current_tick_offset
= (timer_tick_offset
/ multiplier
); 801 restore_flags(flags
);