arch/sparc64/kernel/smp.c

Name: Public Git Hosting - davej-history.git/blob - arch/sparc64/kernel/smp.c
Rating: 4.7 (8486 reviews)
 1 /* smp.c: Sparc64 SMP support.
 2  *
 3  * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu)
 4  */
 5
 6 #include <linux/kernel.h>
 7 #include <linux/sched.h>
 8 #include <linux/mm.h>
 9 #include <linux/pagemap.h>
 10 #include <linux/threads.h>
 11 #include <linux/smp.h>
 12 #include <linux/smp_lock.h>
 13 #include <linux/interrupt.h>
 14 #include <linux/kernel_stat.h>
 15 #include <linux/delay.h>
 16 #include <linux/init.h>
 17 #include <linux/spinlock.h>
 18
 19 #include <asm/head.h>
 20 #include <asm/ptrace.h>
 21 #include <asm/atomic.h>
 22
 23 #include <asm/irq.h>
 24 #include <asm/page.h>
 25 #include <asm/pgtable.h>
 26 #include <asm/oplib.h>
 27 #include <asm/hardirq.h>
 28 #include <asm/softirq.h>
 29 #include <asm/uaccess.h>
 30 #include <asm/timer.h>
 31
 32 #define __KERNEL_SYSCALLS__
 33 #include <linux/unistd.h>
 34
 35 externint linux_num_cpus;
 36 externvoidcalibrate_delay(void);
 37 externunsigned prom_cpu_nodes[];
 38
 39 struct cpuinfo_sparc cpu_data[NR_CPUS]__attribute__((aligned(64)));
 40
 41 volatileint cpu_number_map[NR_CPUS]__attribute__((aligned(64)));
 42 volatileint __cpu_logical_map[NR_CPUS]__attribute__((aligned(64)));
 43
 44 /* Please don't make this stuff initdata!!! --DaveM */
 45 static unsigned char boot_cpu_id =0;
 46 static int smp_activated =0;
 47
 48 /* Kernel spinlock */
 49 spinlock_t kernel_flag = SPIN_LOCK_UNLOCKED;
 50
 51 volatileint smp_processors_ready =0;
 52 unsigned long cpu_present_map =0;
 53 int smp_num_cpus =1;
 54 int smp_threads_ready =0;
 55
 56 void __init smp_setup(char*str,int*ints)
 57 {
 58 /* XXX implement me XXX */
 59 }
 60
 61 intsmp_info(char*buf)
 62 {
 63 int len =7, i;
 64
 65 strcpy(buf,"State:\n");
 66 for(i =0; i < NR_CPUS; i++)
 67 if(cpu_present_map & (1UL<< i))
 68  len +=sprintf(buf + len,
 69 "CPU%d:\t\tonline\n", i);
 70 return len;
 71 }
 72
 73 intsmp_bogo(char*buf)
 74 {
 75 int len =0, i;
 76
 77 for(i =0; i < NR_CPUS; i++)
 78 if(cpu_present_map & (1UL<< i))
 79  len +=sprintf(buf + len,
 80 "Cpu%dBogo\t: %lu.%02lu\n",
 81  i, cpu_data[i].udelay_val /500000,
 82 (cpu_data[i].udelay_val /5000) %100);
 83 return len;
 84 }
 85
 86 void __init smp_store_cpu_info(int id)
 87 {
 88 int i;
 89
 90  cpu_data[id].irq_count =0;
 91  cpu_data[id].bh_count =0;
 92 /* multiplier and counter set by
 93  smp_setup_percpu_timer() */
 94  cpu_data[id].udelay_val = loops_per_sec;
 95
 96  cpu_data[id].pgcache_size =0;
 97  cpu_data[id].pte_cache = NULL;
 98  cpu_data[id].pgdcache_size =0;
 99  cpu_data[id].pgd_cache = NULL;
 100  cpu_data[id].idle_volume =1;
 101
 102 for(i =0; i <16; i++)
 103  cpu_data[id].irq_worklists[i] =0;
 104 }
 105
 106 void __init smp_commence(void)
 107 {
 108 }
 109
 110 static voidsmp_setup_percpu_timer(void);
 111 static voidsmp_tune_scheduling(void);
 112
 113 staticvolatileunsigned long callin_flag =0;
 114
 115 externvoidinherit_locked_prom_mappings(int save_p);
 116 externvoidcpu_probe(void);
 117
 118 void __init smp_callin(void)
 119 {
 120 int cpuid =hard_smp_processor_id();
 121
 122 inherit_locked_prom_mappings(0);
 123
 124 __flush_cache_all();
 125 __flush_tlb_all();
 126
 127 cpu_probe();
 128
 129 /* Master did this already, now is the time for us to do it. */
 130  __asm__ __volatile__("
 131  sethi %%hi(0x80000000), %%g1
 132  sllx %%g1, 32, %%g1
 133  rd %%tick, %%g2
 134  add %%g2, 6, %%g2
 135  andn %%g2, %%g1, %%g2
 136  wrpr %%g2, 0, %%tick
 137 ":/* no outputs */
 138 :/* no inputs */
 139 :"g1","g2");
 140
 141 smp_setup_percpu_timer();
 142
 143 __sti();
 144
 145 calibrate_delay();
 146 smp_store_cpu_info(cpuid);
 147  callin_flag =1;
 148  __asm__ __volatile__("membar #Sync\n\t"
 149 "flush %%g6": : :"memory");
 150
 151 /* Clear this or we will die instantly when we
 152  * schedule back to this idler...
 153  */
 154  current->thread.flags &= ~(SPARC_FLAG_NEWCHILD);
 155
 156 /* Attach to the address space of init_task. */
 157 atomic_inc(&init_mm.mm_count);
 158  current->active_mm = &init_mm;
 159
 160 while(!smp_processors_ready)
 161 membar("#LoadLoad");
 162 }
 163
 164 externintcpu_idle(void);
 165 externvoidinit_IRQ(void);
 166
 167 voidinitialize_secondary(void)
 168 {
 169 }
 170
 171 intstart_secondary(void*unused)
 172 {
 173 trap_init();
 174 init_IRQ();
 175 smp_callin();
 176 returncpu_idle();
 177 }
 178
 179 voidcpu_panic(void)
 180 {
 181 printk("CPU[%d]: Returns from cpu_idle!\n",smp_processor_id());
 182 panic("SMP bolixed\n");
 183 }
 184
 185 externstruct prom_cpuinfo linux_cpus[64];
 186
 187 externunsigned long smp_trampoline;
 188
 189 /* The OBP cpu startup callback truncates the 3rd arg cookie to
 190  * 32-bits (I think) so to be safe we have it read the pointer
 191  * contained here so we work on >4GB machines. -DaveM
 192  */
 193 static struct task_struct *cpu_new_task = NULL;
 194
 195 void __init smp_boot_cpus(void)
 196 {
 197 int cpucount =0, i;
 198
 199 printk("Entering UltraSMPenguin Mode...\n");
 200 __sti();
 201 smp_store_cpu_info(boot_cpu_id);
 202 smp_tune_scheduling();
 203 init_idle();
 204
 205 if(linux_num_cpus ==1)
 206 return;
 207
 208 for(i =0; i < NR_CPUS; i++) {
 209 if(i == boot_cpu_id)
 210 continue;
 211
 212 if(cpu_present_map & (1UL<< i)) {
 213 unsigned long entry = (unsigned long)(&smp_trampoline);
 214 unsigned long cookie = (unsigned long)(&cpu_new_task);
 215 struct task_struct *p;
 216 int timeout;
 217 int no;
 218 externunsigned long phys_base;
 219
 220  entry += phys_base - KERNBASE;
 221  cookie += phys_base - KERNBASE;
 222 kernel_thread(start_secondary, NULL, CLONE_PID);
 223  cpucount++;
 224
 225  p = init_task.prev_task;
 226  init_tasks[cpucount] = p;
 227
 228  p->processor = i;
 229  p->has_cpu =1;/* we schedule the first task manually */
 230
 231 del_from_runqueue(p);
 232 unhash_process(p);
 233
 234  callin_flag =0;
 235 for(no =0; no < linux_num_cpus; no++)
 236 if(linux_cpus[no].mid == i)
 237 break;
 238  cpu_new_task = p;
 239 prom_startcpu(linux_cpus[no].prom_node,
 240  entry, cookie);
 241 for(timeout =0; timeout <5000000; timeout++) {
 242 if(callin_flag)
 243 break;
 244 udelay(100);
 245 }
 246 if(callin_flag) {
 247  cpu_number_map[i] = cpucount;
 248  __cpu_logical_map[cpucount] = i;
 249  prom_cpu_nodes[i] = linux_cpus[no].prom_node;
 250 }else{
 251  cpucount--;
 252 printk("Processor %d is stuck.\n", i);
 253 }
 254 }
 255 if(!callin_flag) {
 256  cpu_present_map &= ~(1UL<< i);
 257  cpu_number_map[i] = -1;
 258 }
 259 }
 260  cpu_new_task = NULL;
 261 if(cpucount ==0) {
 262 printk("Error: only one processor found.\n");
 263  cpu_present_map = (1UL<<smp_processor_id());
 264 }else{
 265 unsigned long bogosum =0;
 266
 267 for(i =0; i < NR_CPUS; i++) {
 268 if(cpu_present_map & (1UL<< i))
 269  bogosum += cpu_data[i].udelay_val;
 270 }
 271 printk("Total of %d processors activated (%lu.%02lu BogoMIPS).\n",
 272  cpucount +1,
 273 (bogosum +2500)/500000,
 274 ((bogosum +2500)/5000)%100);
 275  smp_activated =1;
 276  smp_num_cpus = cpucount +1;
 277 }
 278  smp_processors_ready =1;
 279 membar("#StoreStore | #StoreLoad");
 280 }
 281
 282 /* #define XCALL_DEBUG */
 283
 284 staticinlinevoidxcall_deliver(u64 data0, u64 data1, u64 data2, u64 pstate,unsigned long cpu)
 285 {
 286  u64 result, target = (cpu <<14) |0x70;
 287 int stuck, tmp;
 288
 289 #ifdef XCALL_DEBUG
 290 printk("CPU[%d]: xcall(data[%016lx:%016lx:%016lx],tgt[%016lx])\n",
 291 smp_processor_id(), data0, data1, data2, target);
 292 #endif
 293 again:
 294  tmp =0x40;
 295  __asm__ __volatile__("
 296  wrpr %1, %2, %%pstate
 297  stxa %4, [%0] %3
 298  stxa %5, [%0+%8] %3
 299  add %0, %8, %0
 300  stxa %6, [%0+%8] %3
 301  membar #Sync
 302  stxa %%g0, [%7] %3
 303  membar #Sync"
 304 :"=r"(tmp)
 305 :"r"(pstate),"i"(PSTATE_IE),"i"(ASI_UDB_INTR_W),
 306 "r"(data0),"r"(data1),"r"(data2),"r"(target),"r"(0x10),"0"(tmp));
 307
 308 /* NOTE: PSTATE_IE is still clear. */
 309  stuck =100000;
 310 do{
 311  __asm__ __volatile__("ldxa [%%g0] %1, %0"
 312 :"=r"(result)
 313 :"i"(ASI_INTR_DISPATCH_STAT));
 314 if(result ==0) {
 315  __asm__ __volatile__("wrpr %0, 0x0, %%pstate"
 316 : :"r"(pstate));
 317 return;
 318 }
 319  stuck -=1;
 320 if(stuck ==0)
 321 break;
 322 }while(result &0x1);
 323  __asm__ __volatile__("wrpr %0, 0x0, %%pstate"
 324 : :"r"(pstate));
 325 if(stuck ==0) {
 326 #ifdef XCALL_DEBUG
 327 printk("CPU[%d]: mondo stuckage result[%016lx]\n",
 328 smp_processor_id(), result);
 329 #endif
 330 }else{
 331 #ifdef XCALL_DEBUG
 332 printk("CPU[%d]: Penguin %d NACK's master.\n",smp_processor_id(), cpu);
 333 #endif
 334 udelay(2);
 335 goto again;
 336 }
 337 }
 338
 339 voidsmp_cross_call(unsigned long*func, u32 ctx, u64 data1, u64 data2)
 340 {
 341 if(smp_processors_ready) {
 342 unsigned long mask = (cpu_present_map & ~(1UL<<smp_processor_id()));
 343  u64 pstate, data0 = (((u64)ctx)<<32| (((u64)func) &0xffffffff));
 344 int i, ncpus = smp_num_cpus -1;
 345
 346  __asm__ __volatile__("rdpr %%pstate, %0":"=r"(pstate));
 347 for(i =0; i < NR_CPUS; i++) {
 348 if(mask & (1UL<< i)) {
 349 xcall_deliver(data0, data1, data2, pstate, i);
 350  ncpus--;
 351 }
 352 if(!ncpus)break;
 353 }
 354 /* NOTE: Caller runs local copy on master. */
 355 }
 356 }
 357
 358 externunsigned long xcall_flush_tlb_page;
 359 externunsigned long xcall_flush_tlb_mm;
 360 externunsigned long xcall_flush_tlb_range;
 361 externunsigned long xcall_flush_tlb_all;
 362 externunsigned long xcall_tlbcachesync;
 363 externunsigned long xcall_flush_cache_all;
 364 externunsigned long xcall_report_regs;
 365 externunsigned long xcall_receive_signal;
 366
 367 voidsmp_receive_signal(int cpu)
 368 {
 369 if(smp_processors_ready &&
 370 (cpu_present_map & (1UL<<cpu)) !=0) {
 371  u64 pstate, data0 = (((u64)&xcall_receive_signal) &0xffffffff);
 372  __asm__ __volatile__("rdpr %%pstate, %0":"=r"(pstate));
 373 xcall_deliver(data0,0,0, pstate, cpu);
 374 }
 375 }
 376
 377 voidsmp_report_regs(void)
 378 {
 379 smp_cross_call(&xcall_report_regs,0,0,0);
 380 }
 381
 382 voidsmp_flush_cache_all(void)
 383 {
 384 smp_cross_call(&xcall_flush_cache_all,0,0,0);
 385 __flush_cache_all();
 386 }
 387
 388 voidsmp_flush_tlb_all(void)
 389 {
 390 smp_cross_call(&xcall_flush_tlb_all,0,0,0);
 391 __flush_tlb_all();
 392 }
 393
 394 /* We know that the window frames of the user have been flushed
 395  * to the stack before we get here because all callers of us
 396  * are flush_tlb_*() routines, and these run after flush_cache_*()
 397  * which performs the flushw.
 398  *
 399  * XXX I diked out the fancy flush avoidance code for the
 400  * XXX swapping cases for now until the new MM code stabilizes. -DaveM
 401  *
 402  * The SMP TLB coherency scheme we use works as follows:
 403  *
 404  * 1) mm->cpu_vm_mask is a bit mask of which cpus an address
 405  * space has (potentially) executed on, this is the heuristic
 406  * we use to avoid doing cross calls.
 407  *
 408  * 2) TLB context numbers are shared globally across all processors
 409  * in the system, this allows us to play several games to avoid
 410  * cross calls.
 411  *
 412  * One invariant is that when a cpu switches to a process, and
 413  * that processes tsk->active_mm->cpu_vm_mask does not have the
 414  * current cpu's bit set, that tlb context is flushed locally.
 415  *
 416  * If the address space is non-shared (ie. mm->count == 1) we avoid
 417  * cross calls when we want to flush the currently running process's
 418  * tlb state. This is done by clearing all cpu bits except the current
 419  * processor's in current->active_mm->cpu_vm_mask and performing the
 420  * flush locally only. This will force any subsequent cpus which run
 421  * this task to flush the context from the local tlb if the process
 422  * migrates to another cpu (again).
 423  *
 424  * 3) For shared address spaces (threads) and swapping we bite the
 425  * bullet for most cases and perform the cross call.
 426  *
 427  * The performance gain from "optimizing" away the cross call for threads is
 428  * questionable (in theory the big win for threads is the massive sharing of
 429  * address space state across processors).
 430  *
 431  * For the swapping case the locking is difficult to get right, we'd have to
 432  * enforce strict ordered access to mm->cpu_vm_mask via a spinlock for example.
 433  * Then again one could argue that when you are swapping, the cost of a cross
 434  * call won't even show up on the performance radar. But in any case we do get
 435  * rid of the cross-call when the task has a dead context or the task has only
 436  * ever run on the local cpu.
 437  */
 438 voidsmp_flush_tlb_mm(struct mm_struct *mm)
 439 {
 440  u32 ctx =CTX_HWBITS(mm->context);
 441
 442 if(mm == current->active_mm &&
 443 atomic_read(&mm->mm_users) ==1&&
 444 (mm->cpu_vm_mask == (1UL<<smp_processor_id())))
 445 goto local_flush_and_out;
 446
 447 smp_cross_call(&xcall_flush_tlb_mm, ctx,0,0);
 448
 449 local_flush_and_out:
 450 __flush_tlb_mm(ctx, SECONDARY_CONTEXT);
 451 }
 452
 453 voidsmp_flush_tlb_range(struct mm_struct *mm,unsigned long start,
 454 unsigned long end)
 455 {
 456  u32 ctx =CTX_HWBITS(mm->context);
 457
 458  start &= PAGE_MASK;
 459  end &= PAGE_MASK;
 460 if(mm == current->active_mm &&
 461 atomic_read(&mm->mm_users) ==1&&
 462 (mm->cpu_vm_mask == (1UL<<smp_processor_id())))
 463 goto local_flush_and_out;
 464
 465 smp_cross_call(&xcall_flush_tlb_range, ctx, start, end);
 466
 467 local_flush_and_out:
 468 __flush_tlb_range(ctx, start, SECONDARY_CONTEXT, end, PAGE_SIZE, (end-start));
 469 }
 470
 471 voidsmp_flush_tlb_page(struct mm_struct *mm,unsigned long page)
 472 {
 473  u32 ctx =CTX_HWBITS(mm->context);
 474
 475  page &= PAGE_MASK;
 476 if(mm == current->active_mm &&
 477 atomic_read(&mm->mm_users) ==1&&
 478 (mm->cpu_vm_mask == (1UL<<smp_processor_id()))) {
 479 goto local_flush_and_out;
 480 }
 481
 482 smp_cross_call(&xcall_flush_tlb_page, ctx, page,0);
 483
 484 local_flush_and_out:
 485 __flush_tlb_page(ctx, page, SECONDARY_CONTEXT);
 486 }
 487
 488 /* CPU capture. */
 489 /* #define CAPTURE_DEBUG */
 490 externunsigned long xcall_capture;
 491
 492 static atomic_t smp_capture_depth =ATOMIC_INIT(0);
 493 static atomic_t smp_capture_registry =ATOMIC_INIT(0);
 494 static unsigned long penguins_are_doing_time =0;
 495
 496 voidsmp_capture(void)
 497 {
 498 if(smp_processors_ready) {
 499 int result =atomic_add_return(1, &smp_capture_depth);
 500
 501 membar("#StoreStore | #LoadStore");
 502 if(result ==1) {
 503 int ncpus = smp_num_cpus;
 504
 505 #ifdef CAPTURE_DEBUG
 506 printk("CPU[%d]: Sending penguins to jail...",
 507 smp_processor_id());
 508 #endif
 509  penguins_are_doing_time =1;
 510 membar("#StoreStore | #LoadStore");
 511 atomic_inc(&smp_capture_registry);
 512 smp_cross_call(&xcall_capture,0,0,0);
 513 while(atomic_read(&smp_capture_registry) != ncpus)
 514 membar("#LoadLoad");
 515 #ifdef CAPTURE_DEBUG
 516 printk("done\n");
 517 #endif
 518 }
 519 }
 520 }
 521
 522 voidsmp_release(void)
 523 {
 524 if(smp_processors_ready) {
 525 if(atomic_dec_and_test(&smp_capture_depth)) {
 526 #ifdef CAPTURE_DEBUG
 527 printk("CPU[%d]: Giving pardon to imprisoned penguins\n",
 528 smp_processor_id());
 529 #endif
 530  penguins_are_doing_time =0;
 531 membar("#StoreStore | #StoreLoad");
 532 atomic_dec(&smp_capture_registry);
 533 }
 534 }
 535 }
 536
 537 /* Imprisoned penguins run with %pil == 15, but PSTATE_IE set, so they
 538  * can service tlb flush xcalls...
 539  */
 540 voidsmp_penguin_jailcell(void)
 541 {
 542 flushw_user();
 543 atomic_inc(&smp_capture_registry);
 544 membar("#StoreLoad | #StoreStore");
 545 while(penguins_are_doing_time)
 546 membar("#LoadLoad");
 547 atomic_dec(&smp_capture_registry);
 548 }
 549
 550 staticinlinevoidsparc64_do_profile(unsigned long pc,unsigned long g3)
 551 {
 552 if(prof_buffer && current->pid) {
 553 externint _stext;
 554 externint rwlock_impl_begin, rwlock_impl_end;
 555 externint atomic_impl_begin, atomic_impl_end;
 556
 557 if((pc >= (unsigned long) &rwlock_impl_begin &&
 558  pc < (unsigned long) &rwlock_impl_end) ||
 559 (pc >= (unsigned long) &atomic_impl_begin &&
 560  pc < (unsigned long) &atomic_impl_end))
 561  pc = g3;
 562
 563  pc -= (unsigned long) &_stext;
 564  pc >>= prof_shift;
 565
 566 if(pc >= prof_len)
 567  pc = prof_len -1;
 568 atomic_inc((atomic_t *)&prof_buffer[pc]);
 569 }
 570 }
 571
 572 static unsigned long current_tick_offset;
 573
 574 #define prof_multiplier(__cpu) cpu_data[(__cpu)].multiplier
 575 #define prof_counter(__cpu) cpu_data[(__cpu)].counter
 576
 577 externvoidupdate_one_process(struct task_struct *p,unsigned long ticks,
 578 unsigned long user,unsigned long system,
 579 int cpu);
 580
 581 voidsmp_percpu_timer_interrupt(struct pt_regs *regs)
 582 {
 583 unsigned long compare, tick;
 584 int cpu =smp_processor_id();
 585 int user =user_mode(regs);
 586
 587 /*
 588  * Check for level 14 softint.
 589  */
 590 if(!(get_softint() & (1UL<<0))) {
 591 externvoidhandler_irq(int,struct pt_regs *);
 592
 593 handler_irq(14, regs);
 594 return;
 595 }
 596
 597 clear_softint((1UL<<0));
 598 do{
 599 if(!user)
 600 sparc64_do_profile(regs->tpc, regs->u_regs[UREG_G3]);
 601 if(!--prof_counter(cpu))
 602 {
 603 if(cpu == boot_cpu_id) {
 604 /* XXX Keep this in sync with irq.c --DaveM */
 605 #define irq_enter(cpu, irq) \
 606 do { hardirq_enter(cpu); \
 607  spin_unlock_wait(&global_irq_lock); \
 608 } while(0)
 609 #define irq_exit(cpu, irq) hardirq_exit(cpu)
 610
 611 irq_enter(cpu,0);
 612  kstat.irqs[cpu][0]++;
 613
 614 timer_tick_interrupt(regs);
 615
 616 irq_exit(cpu,0);
 617
 618 #undef irq_enter
 619 #undef irq_exit
 620 }
 621
 622 if(current->pid) {
 623 unsigned int*inc, *inc2;
 624
 625 update_one_process(current,1, user, !user, cpu);
 626 if(--current->counter <=0) {
 627  current->counter =0;
 628  current->need_resched =1;
 629 }
 630
 631 if(user) {
 632 if(current->priority < DEF_PRIORITY) {
 633  inc = &kstat.cpu_nice;
 634  inc2 = &kstat.per_cpu_nice[cpu];
 635 }else{
 636  inc = &kstat.cpu_user;
 637  inc2 = &kstat.per_cpu_user[cpu];
 638 }
 639 }else{
 640  inc = &kstat.cpu_system;
 641  inc2 = &kstat.per_cpu_system[cpu];
 642 }
 643 atomic_inc((atomic_t *)inc);
 644 atomic_inc((atomic_t *)inc2);
 645 }
 646 prof_counter(cpu) =prof_multiplier(cpu);
 647 }
 648
 649  __asm__ __volatile__("rd %%tick_cmpr, %0\n\t"
 650 "add %0, %2, %0\n\t"
 651 "wr %0, 0x0, %%tick_cmpr\n\t"
 652 "rd %%tick, %1"
 653 :"=&r"(compare),"=r"(tick)
 654 :"r"(current_tick_offset));
 655 }while(tick >= compare);
 656 }
 657
 658 static void __init smp_setup_percpu_timer(void)
 659 {
 660 int cpu =smp_processor_id();
 661
 662 prof_counter(cpu) =prof_multiplier(cpu) =1;
 663
 664  __asm__ __volatile__("rd %%tick, %%g1\n\t"
 665 "add %%g1, %0, %%g1\n\t"
 666 "wr %%g1, 0x0, %%tick_cmpr"
 667 :/* no outputs */
 668 :"r"(current_tick_offset)
 669 :"g1");
 670 }
 671
 672 void __init smp_tick_init(void)
 673 {
 674 int i;
 675
 676  boot_cpu_id =hard_smp_processor_id();
 677  current_tick_offset = timer_tick_offset;
 678  cpu_present_map =0;
 679 for(i =0; i < linux_num_cpus; i++)
 680  cpu_present_map |= (1UL<< linux_cpus[i].mid);
 681 for(i =0; i < NR_CPUS; i++) {
 682  cpu_number_map[i] = -1;
 683  __cpu_logical_map[i] = -1;
 684 }
 685  cpu_number_map[boot_cpu_id] =0;
 686  prom_cpu_nodes[boot_cpu_id] = linux_cpus[0].prom_node;
 687  __cpu_logical_map[0] = boot_cpu_id;
 688  current->processor = boot_cpu_id;
 689 prof_counter(boot_cpu_id) =prof_multiplier(boot_cpu_id) =1;
 690 }
 691
 692 staticinlineunsigned longfind_flush_base(unsigned long size)
 693 {
 694 struct page *p = mem_map;
 695 unsigned long found, base;
 696
 697  size =PAGE_ALIGN(size);
 698  found = size;
 699  base =page_address(p);
 700 while(found !=0) {
 701 /* Failure. */
 702 if(p >= (mem_map + max_mapnr))
 703 return0UL;
 704 if(PageSkip(p)) {
 705  p = p->next_hash;
 706  base =page_address(p);
 707  found = size;
 708 }else{
 709  found -= PAGE_SIZE;
 710  p++;
 711 }
 712 }
 713 return base;
 714 }
 715
 716 cycles_t cacheflush_time;
 717
 718 static void __init smp_tune_scheduling(void)
 719 {
 720 unsigned long flush_base, flags, *p;
 721 unsigned int ecache_size;
 722  cycles_t tick1, tick2, raw;
 723
 724 /* Approximate heuristic for SMP scheduling. It is an
 725  * estimation of the time it takes to flush the L2 cache
 726  * on the local processor.
 727  *
 728  * The ia32 chooses to use the L1 cache flush time instead,
 729  * and I consider this complete nonsense. The Ultra can service
 730  * a miss to the L1 with a hit to the L2 in 7 or 8 cycles, and
 731  * L2 misses are what create extra bus traffic (ie. the "cost"
 732  * of moving a process from one cpu to another).
 733  */
 734 printk("SMP: Calibrating ecache flush... ");
 735  ecache_size =prom_getintdefault(linux_cpus[0].prom_node,
 736 "ecache-size", (512*1024));
 737  flush_base =find_flush_base(ecache_size <<1);
 738
 739 if(flush_base !=0UL) {
 740 __save_and_cli(flags);
 741
 742 /* Scan twice the size once just to get the TLB entries
 743  * loaded and make sure the second scan measures pure misses.
 744  */
 745 for(p = (unsigned long*)flush_base;
 746 ((unsigned long)p) < (flush_base + (ecache_size<<1));
 747  p += (64/sizeof(unsigned long)))
 748 *((volatileunsigned long*)p);
 749
 750 /* Now the real measurement. */
 751  __asm__ __volatile__("
 752  b,pt %%xcc, 1f
 753  rd %%tick, %0
 754
 755  .align 64
 756 1: ldx [%2 + 0x000], %%g1
 757  ldx [%2 + 0x040], %%g2
 758  ldx [%2 + 0x080], %%g3
 759  ldx [%2 + 0x0c0], %%g5
 760  add %2, 0x100, %2
 761  cmp %2, %4
 762  bne,pt %%xcc, 1b
 763  nop
 764
 765  rd %%tick, %1"
 766 :"=&r"(tick1),"=&r"(tick2),"=&r"(flush_base)
 767 :"2"(flush_base),"r"(flush_base + ecache_size)
 768 :"g1","g2","g3","g5");
 769
 770 __restore_flags(flags);
 771
 772  raw = (tick2 - tick1);
 773
 774 /* Dampen it a little, considering two processes
 775  * sharing the cache and fitting.
 776  */
 777  cacheflush_time = (raw - (raw >>2));
 778 }else
 779  cacheflush_time = ((ecache_size <<2) +
 780 (ecache_size <<1));
 781
 782 printk("Using heuristic of %d cycles.\n",
 783 (int) cacheflush_time);
 784 }
 785
 786 /* /proc/profile writes can call this, don't __init it please. */
 787 intsetup_profiling_timer(unsigned int multiplier)
 788 {
 789 unsigned long flags;
 790 int i;
 791
 792 if((!multiplier) || (timer_tick_offset / multiplier) <1000)
 793 return-EINVAL;
 794
 795 save_and_cli(flags);
 796 for(i =0; i < NR_CPUS; i++) {
 797 if(cpu_present_map & (1UL<< i))
 798 prof_multiplier(i) = multiplier;
 799 }
 800  current_tick_offset = (timer_tick_offset / multiplier);
 801 restore_flags(flags);
 802
 803 return0;
 804 }