#include <l4/types.h>
#include <flux/machine/pc/rtc.h>
#include <flux/machine/eflags.h>
#include <flux/machine/base_trap.h>

#include <string.h>
#include <stdio.h>

#include "config.h"
#include "globals.h"
#include "kdb.h"
#include "time.h"
#include "ipc.h"

#include "irq.h"
#include "thread.h"
#include "thread_list.h"

//
// class thread_t
//

// Per-priority ready-queue bookkeeping (256 slots, indexed by my_prio).
// schedule() takes its candidate from prio_next[prio_highest] and, for a
// still-running thread, sets prio_next[my_prio] to either the thread's
// ready_next or prio_first[my_prio].
thread_t *thread_t::prio_first[256];
thread_t *thread_t::prio_next[256];
// Index schedule() uses to pick the queue it scans.
// NOTE(review): presumably the highest priority that currently has a
// ready thread; it is maintained outside the code visible here -- confirm.
unsigned thread_t::prio_highest;

// Handler invoked on the "generic" path of handle_slow_trap(); while it
// is 0, that path reports the trap as not handled.
int (*thread_t::nested_trap_handler)(trap_state *state) = 0;

// Tick budget of the running thread: reloaded by schedule() from the
// chosen thread's my_timeslice and decremented by thread_timer_interrupt().
int timeslice_ticks_left = config::default_time_slice; 

// allocator
// operator new() in thread.h

// deallocator
// Intentionally empty: the storage of a thread control block is not
// released here (see the XXX note below for what a real implementation
// should do).
void 
thread_t::operator delete(void *)
{
  // XXX should check if all thread blocks on a given page are free
  // and deallocate (or mark free) the page if so.  this should be
  // easy to detect since normally all threads of a given task are
  // destroyed at once at task deletion time
}

// constructor

// The constructor doesn't really initialize the data members of the
// thread control block (tcb); instead, it sets up the tcb just enough
// so that the thread can be sent signals.
thread_t::thread_t()
{
  // Being able to receive signals involves: (a) pushing some restart
  // address on the stack (we use user_invoke(), which just keeps
  // scheduling until the thread is runnable); (b) setting up the
  // thread's kernel stack pointer.

  // Top of the kernel stack, just below the register save area at the
  // end of the tcb.  Pointer <-> integer conversions require
  // reinterpret_cast; static_cast is ill-formed for them.
  unsigned long *init_sp = reinterpret_cast<unsigned long*>
    (reinterpret_cast<unsigned long>(this) + size - sizeof(thread_regs_t));

  // set a magic value -- we use it later to verify the stack hasn't
  // been overrun
  _magic = magic;

  // Both slots are only written if they still contain 0.  We don't
  // care about errors: they just mean someone else has already set up
  // the tcb.
  compare_and_swap(--init_sp, 0UL,
                   reinterpret_cast<unsigned long>(user_invoke));
  compare_and_swap(reinterpret_cast<unsigned long*>(&kernel_sp),
                   0UL, reinterpret_cast<unsigned long>(init_sp));

  // ok, we're ready to go!
}

// Tear down a tcb: the thread must already be dead, have no queued
// signals and an intact magic value.  The destructor undoes what the
// constructor set up (restart address slot and kernel stack pointer
// are zeroed) and marks the tcb Thread_invalid.
thread_t::~thread_t()
{
  assert(state() == Thread_dead);

  assert(! my_signals.first());
  assert(_magic == magic);

  // Same stack-top computation as in the constructor.  Pointer <->
  // integer conversions require reinterpret_cast; static_cast is
  // ill-formed for them.
  unsigned long *init_sp = reinterpret_cast<unsigned long*>
    (reinterpret_cast<unsigned long>(this) + size - sizeof(thread_regs_t));

  switching_in.clear();

  // a zero kernel_sp makes switch_to() refuse to switch to this tcb
  kernel_sp = 0;
  *--init_sp = 0;               // clear the restart-address slot
  my_state = Thread_invalid;
}

// public services

// the scheduler
void 
thread_t::schedule()
{
  // Careful!  At this point, we may (due to a context switch) be
  // dequeued from the ready list but then set ready again, so
  // that our "ready_next" pointer becomes invalid
  cli();
  if (in_ready_list())
    {
      if (! (my_state & Thread_running))
	ready_dequeue();
      else
	prio_next[my_prio] = ready_next->my_prio == my_prio ? 
	  ready_next : prio_first[my_prio];
    }
  else    
    {
      // we're not in the ready list.  if we're ready nontheless,
      // enqueue now so that we can be scheduled correctly

      if (my_state & Thread_running)
	ready_enqueue();
    }
  sti();

  thread_t *next_to_run;

  for (;;)
    {
      next_to_run = prio_next[prio_highest];

      if (! next_to_run) 
	{
	  if (prio_highest == 0)
	    {
	      // XXX can this happen?
	      next_to_run = kernel_thread;
	      break;
	    }

	  continue;		// try again -- someone has modified the queue
	}

      if (next_to_run->my_state & Thread_running) 
	break;

      next_to_run->ready_dequeue();
    }
	
  timeslice_ticks_left = next_to_run->my_timeslice;

  if (next_to_run != this) switch_to(next_to_run);
}

//
// private functions 
//

bool 
thread_t::do_signals()
{
  // make sure state-switch handling and all signals are serialized
  if (switching_in.test_and_set())
    return false;

  for (;;)
    {
      // check for signals
      for (signal_t *s = my_signals.dequeue(); 
	   s; 
	   s = my_signals.dequeue())
	{
	  thread_t *sender = s->sender();
	  s->set_ret(s->func(this)); // call signal handler
	  
	  s = my_signals.first(); // next signal?
	  
	  if (s 		// there is another signal and it
	      && sender->my_prio < s->sender()->my_prio) // has higher prio
	    continue;		// then handle next signal immediately
	    
	  // switch back so sender of signal if we're not ready to run
	  // and higher priorized
	  if (s			// more signals left -- not ready to run
	      || sender->my_prio > my_prio
	      || (! (state() & Thread_running)))
	    {
	      // then switch to sender of handled signal
	      if (this != sender) switch_to(sender);
	    }
	}
      
      while ((! my_signals.first())
	     && (! (state() & Thread_running)))
	{
	  schedule();
	}

      switching_in.clear();

      // repeat if more signals arrived

      if (! my_signals.first())
	break;			
  
      check (switching_in.test_and_set() == 0);
    }

  return true;
}


// send a signal to a thread
// Send signal s to thread `to' and wait until its handler has run.
//
// Must be called on the current thread.  Returns the signal's return
// value, or signal_t::Sig_aborted if the signal could not be inserted
// into the target's queue.  If switching to the target fails, the
// signal's return value is set to Sig_aborted and then returned.
unsigned long 
thread_t::send_signal(thread_t *to, signal_t *s)
{
  assert(this == current());
  assert((state() & Thread_running) || switching_in.test());

  // If this is the first signal we're sending (i.e., we're not sending
  // a signal from a signal handler), register the signal so that we
  // can be deleted safely.
  const bool outermost = state_add(Thread_signalling);

  if (outermost)
    {
      my_signal_partner = to;   // the partner we send the signal to
      my_sent_signal = s;
    }

  // insert signal structure into target's signal list
  if (to->insert_signal(s))
    {
      // execute the signal handler in target's context
      while (! s->done())
        {
          if (to != this)
            {
              if (switch_to(to))
                continue;

              // could not switch to the target
              s->set_ret(signal_t::Sig_aborted);
              break;
            }

          // target is ourselves: do signals locally
          if (! switchin_context())
            panic("recursive signals"); // we interrupted a signal handler
        }

      // done -- resume execution as normal
      if (outermost)
        {
          my_sent_signal = 0;
          state_del(Thread_signalling);
        }
      return s->ret();
    }

  // the target did not accept the signal
  if (outermost)
    {
      my_sent_signal = 0;
      state_del(Thread_signalling);
    }
  s->set_ret(signal_t::Sig_aborted);
  return signal_t::Sig_aborted;
}

// thread context switchin -- called on every re-activation of a thread
// Thread context switch-in -- called on every re-activation of a thread.
//
// Delivers pending signals first.  Returns false if that was not
// possible because signal processing is already in progress; otherwise
// sets up the kernel stack pointer used for returning to user mode,
// switches to the thread's address space and returns true.
bool
thread_t::switchin_context()
{
  assert(this == current());

  if (! do_signals())		// could we deliver signals (sig q lockable?)
    {				// no -- signal processing already in progress
      assert(switching_in.test());

      return false;
    }

  // only need to do more switching work if we were able to lock the
  // signal queue

#if 0  
  assert(state() & Thread_running); // XXX actually, this can happen;
				    // we should schedule...
#endif

  // Set kernel-esp in case we want to return to the user.  regs() + 1
  // is a pointer, so converting it to vm_offset_t needs
  // reinterpret_cast (static_cast is ill-formed here).
  *(kmem::kernel_esp()) = reinterpret_cast<vm_offset_t>(regs() + 1);
  
  // switch to our page directory if necessary
  my_space->switchin_context();

  return true;
}

// switch to a specific different thread and immediately handle its signals
// Switch from the current thread (this) to thread t and run t's
// switchin_context() on t's kernel stack.
//
// The old thread's frame pointer (unless NO_FRAME_PTR) and a restart
// address are pushed on its own stack and its stack pointer is stored
// in kernel_sp before t's kernel_sp is loaded.
//
// Returns true when execution resumes at the restart address pushed
// here, and false if t->kernel_sp is 0; in the latter case the old
// stack pointer is reloaded and no switch takes place.
bool 
thread_t::switch_to(thread_t *t)
{
  assert(current() != t);
  assert(this == current());

  // make sure stack has enough space
  assert(t->kernel_sp == 0 
	 || reinterpret_cast<vm_offset_t>(t->kernel_sp)
	    > reinterpret_cast<vm_offset_t>(t) + sizeof(thread_t) + 0x20);

  // if we stay runnable, make sure we can be found by the scheduler
  if ((state() & Thread_running) && ! in_ready_list())
    ready_enqueue();
    
  // switch to new thread's stack

  bool ret;

  // caution: be sure not to use any registers in here so that the
  // thread switching code remains preemptible as well
  asm volatile
    (// save context of old thread
#ifndef NO_FRAME_PTR
     "   pushl %%ebp \n"
#endif
     "   pushl $1f \n"		// push restart address on old stack

     "   cli \n"		// XXX is this really necessary?
     "   movl  %%esp, (%1) \n"	// save stack pointer

     // read context of new thread
     "   movl  (%2), %%esp \n"	// load new stack pointer - now in other thread
     "   testl %%esp,%%esp \n"	// check new stack pointer - is it 0?
     "   jz    3f \n"		// yes - fail
     "   sti \n"		// XXX

     // deliver signals to new thread
     "   pushl %%esp \n"	// pass ptr to context as argument
     "   pushl %3 \n"		// new thread's "this" -- XXX correct?
     "   call  %4 \n"		// address of switchin_context()
     "   addl  $8,%%esp \n"	// clean up args

     // return to old context
     "   ret \n"		// jump to restart address on stack

     // code for failure
     "3: \n"
     "   movl  (%1), %%esp \n"	// load old stack pointer
     "   sti \n"
     "   xor   %0,%0 \n"	// return false
     "   jmp   2f \n"

     // label for our restart address
     "   .p2align 4 \n"		// start code at new cache line
     "1: mov   $1,%0 \n"	// return true
     "2: \n"			// label for end of this asm segment
#ifndef NO_FRAME_PTR
     "   popl %%ebp \n"
#endif
     // operands: %0 = ret, %1 = &kernel_sp (ours), %2 = &t->kernel_sp,
     // %3 = t, %4 = switchin_context
     : "=r" (ret)
     : "r" (&kernel_sp), "r" (&t->kernel_sp), "r" (t), "m" (switchin_context)
     : "eax", "ebx", "ecx", "edx", "esi", "edi", 
#ifdef NO_FRAME_PTR
       "ebp", 
#endif
       "memory");

  return ret;
}

//
// internal stuff -- all of these routines are serialized and run on
// the stack of the thread to be manipulated, so we don't need to
// synchronize access to private members (except for the kernel stack
// pointer)
//

// Not expected to be called: unconditionally panics.
void 
thread_t::do_kill()
{
  panic("thread_t::do_kill called");
}

// Signal handler: turn a freshly constructed (Thread_invalid) tcb into
// a dead-but-present thread.
//
// Fills in address space, id, priority, mcp and default time slice
// from the signal's parameters, makes sure the tcb is mapped in the
// new address space, sets Thread_dead and links the thread into the
// present list.  Returns Sig_fail if the tcb is not in state
// Thread_invalid, Sig_success otherwise.
unsigned long 
signal_reset_t::func(thread_t *t)
{
  if (t->state() != Thread_invalid)
    return Sig_fail;

  // Make sure we're freshly initialized: the slot just below the
  // register save area must still hold the restart address installed
  // by the constructor.  Pointer <-> integer conversions require
  // reinterpret_cast; static_cast is ill-formed for them.
  assert(*reinterpret_cast<unsigned long*>
	   (reinterpret_cast<unsigned long>(t)
	    + thread_t::size
	    - sizeof(thread_regs_t) - 4)
	 == reinterpret_cast<unsigned long>(thread_t::user_invoke));

  t->my_space = my_space;
  t->my_id = *my_id;
  t->my_prio = my_init_prio;
  t->my_mcp = my_mcp;
  t->my_irq = 0;
  t->my_idt_limit = 0;

  t->my_timeslice = config::default_time_slice;

  // make sure the thread's kernel stack is mapped in its address space
  my_space->kmem_update(reinterpret_cast<vm_offset_t>(t));

  t->state_add(Thread_dead);

  t->my_pager = t->my_preempter = t->my_ext_preempter = nil_thread;

  if (t->space_index() == sender()->space_index())
    t->present_enqueue(sender()); // same task -> enqueue after creator
  else				// other task -> enqueue in front of this task
    t->present_enqueue(sender()->space_index().lookup_thread()->present_prev);
    // that's safe because thread 0 of a task is always present

  return Sig_success;
}

// Signal handler: (re)initialize the user-level state of thread t.
//
// Reports the previous pager, preempter, esp, eip and eflags through
// the optional output pointers, installs the new values (0xffffffff
// for eip/esp and 0 for pager/preempter mean "leave unchanged"),
// resets the user segment selectors and eflags, and finally makes the
// thread runnable.  Returns Sig_fail for a Thread_invalid tcb.
unsigned long 
signal_init_t::func(thread_t *t)
{
  if (t->state() == Thread_invalid)
    return Sig_fail;

  thread_ret_regs_t *const frame = t->regs();

  // hand the old values back to whoever asked for them
  if (my_o_pager)
    *my_o_pager = t->my_pager;
  if (my_o_preempter)
    *my_o_preempter = t->my_preempter;
  if (my_o_esp)
    *my_o_esp = frame->esp;
  if (my_o_eip)
    *my_o_eip = frame->eip;
  if (my_o_eflags)
    *my_o_eflags = frame->eflags;

  const bool new_eip_given = (my_eip != 0xffffffff);

  if (new_eip_given)
    {
      frame->eip = my_eip;

      const bool thread_is_alive = ! (t->state() & Thread_dead);
      if (thread_is_alive)
        {
#if 0
	  kdb::ke("reseting non-dead thread");
#endif
          // cancel ongoing IPC or other activity
          t->state_change(~(Thread_waiting|Thread_receiving|Thread_polling),
                          Thread_cancel);
        }
    }

  if (my_pager != 0)
    t->my_pager = my_pager;
  if (my_preempter != 0)
    t->my_preempter = my_preempter;

  frame->cs = kmem::gdt_code_user | SEL_PL_U;
  frame->eflags = EFL_IOPL_USER | EFL_IF | 2;	// XXX iopl=kernel, ei
  if (my_esp != 0xffffffff)
    frame->esp = my_esp;
  frame->ss = kmem::gdt_data_user | SEL_PL_U;

  // clear Thread_dead, set Thread_running
  t->state_change(~Thread_dead, Thread_running);
  // need not enqueue in ready queue here -- switch_to() will do this for us

  return Sig_success;
}

// Signal handler: query and optionally change the scheduling
// parameters of thread t.
//
// Fails (Sig_fail) for a Thread_invalid tcb or when t's priority
// exceeds the sender's mcp.  Otherwise the old state, time slice,
// priority, partner and external preempter are reported through the
// output pointers; then, unless my_param.sched_param is 0xffffffff,
// the new priority (if permitted by the sender's mcp) and time slice
// are installed.  A non-zero my_ext_preempter is installed if it
// exists.
//
// Time slices are encoded as mantissa/exponent: as the decoding below
// shows, the value is time_man << (2 * (15 - time_exp)) microseconds.
unsigned long
signal_schedule_t::func(thread_t *t)
{
  if (t->state() == Thread_invalid)
    return Sig_fail;

  // call legal?
  if (t->my_prio > sender()->my_mcp)
    return Sig_fail;

  // query current state
  unsigned s = t->state();

  if (s & (Thread_polling|Thread_receiving))
    *my_partner = t->my_partner;
  else
    *my_partner = 0;

  // translate the internal state bits into the reported state code
  if (s & Thread_dead) s = 0xf;
  else if (s & Thread_polling) s = (s & Thread_running) ? 8 : 0xd;
  else if (s & (Thread_waiting|Thread_receiving)) 
    s = (s & Thread_running) ? 8 : 0xc;
  else s = 0;

  my_o_param->sp.zero = s;

  // encode the current time slice: shrink the mantissa to 8 bits,
  // adjusting the exponent by one for every factor of 4
  unsigned long long timeslice = t->my_timeslice * config::microsec_per_tick;
  unsigned exp = 15;

  while (timeslice > 255)
    {
      timeslice >>= 2;
      exp--;
    }

  my_o_param->sp.time_exp = exp;
  my_o_param->sp.time_man = timeslice;

  my_o_param->sp.prio = t->my_prio;

  my_o_param->sp.small = 0;	// unsupported

  *my_o_ext_preempter = t->my_ext_preempter;

  my_time->low = my_time->high = 0; // XXX unsupported

  // set new state
  if (my_param.sched_param != 0xffffffff)
    {
      if (my_param.sp.prio <= sender()->my_mcp)
	if (t->my_prio != my_param.sp.prio)
	  {
	    // We need to protect the priority manipulation so that
	    // this thread can not be preempted and ready-enqueued
	    // according to a wrong priority
	    cli();
	    // The decision must be based on actual queue membership,
	    // not on Thread_running: a queued thread has to leave the
	    // queue of its old priority before my_prio changes, and a
	    // thread that is not queued must not be dequeued.
	    if (t->in_ready_list())
	      t->ready_dequeue(); // need to re-queue in ready queue
				// according to new prio
	    t->my_prio = my_param.sp.prio;
	    sti();
	  }
      
      if (my_param.sp.time_exp)
	{
	  if (my_param.sp.time_man)
	    {
	      timeslice = static_cast<unsigned long long>(my_param.sp.time_man)
		<< ((15 - my_param.sp.time_exp) << 1);
	      t->my_timeslice = timeslice / config::microsec_per_tick;

	      // never round a non-zero request down to "no time slice"
	      if (t->my_timeslice == 0)
		t->my_timeslice = 1;
	    }
	  else
	    t->my_timeslice = 0;
	}
    }

  if (my_ext_preempter)
    {
      if (my_ext_preempter->exists())
	t->my_ext_preempter = my_ext_preempter;
      else
	kdb::ke("sig_sched: invalid ext_preempter");
    }

  return Sig_success;
}

// Signal handler: redirect thread t to a user-level exception handler.
//
// Fails unless t is running and not cancelled.  On success the saved
// eip is replaced by the handler address and the saved esp is lowered
// by three words, or four for traps that carry an error code.
unsigned long
signal_exception_t::func(thread_t *t)
{
  if (! (t->state() & Thread_running))
    return Sig_fail;

  if (t->state() & Thread_cancel)
    return Sig_fail;

  _ts->eip = _handler;

  // traps 0x08, 0x0a..0x0e and 0x11 need an extra slot for the error code
  const bool pushes_error_code =
    (_ts->trapno >= 0x0a && _ts->trapno <= 0x0e)
    || _ts->trapno == 0x08
    || _ts->trapno == 0x11;

  if (pushes_error_code)
    _ts->esp -= 4 * 4;
  else
    _ts->esp -= 3 * 4;

  return Sig_success;
}

// Signal handler: kill thread t.
//
// Fails for a tcb that is Thread_invalid or already dead.  An
// outstanding signal sent by t is driven to completion first.  If t is
// thread 0 of its task, all other threads of the task and all subtasks
// are killed and deleted and the address space is flushed.  Finally t
// is marked dead, detached from its irq and timeout, pending signals
// are aborted, and t is removed from the sender, present and ready
// lists.
unsigned long
signal_kill_t::func(thread_t *t)
{
  if (t->state() == Thread_invalid)
    return Sig_fail;

  if (t->state() & Thread_dead)
    return Sig_fail;

  assert(t->in_present_list());

  // If sending another signal, finish it before killing this thread
  if ((t->state() & (Thread_signalling|Thread_running))
      && t->my_sent_signal)
    {
      assert(t->my_signal_partner != t);

      // keep switching to the receiver until the signal is done or we
      // cannot switch into the receiver's context any more
      while (! t->my_sent_signal->done()
             && t->switch_to(t->my_signal_partner))
        {
        }
    }

  // If we're thread 0, start by deleting all our threads.
  const bool is_task_leader = (t->id().id.lthread == 0);

  if (is_task_leader)
    {
      // Kill threads belonging to the same task we find in the
      // present list.  We assume here that new threads of the same
      // task are always enqueued /after/ thread 0.
      while (t->present_next != t
             && t->present_next->space() == t->space())
        {
          // remember the victim: killing it dequeues it from the
          // present queue
          thread_t *sibling = t->present_next;

          if (! sibling->kill())
            return Sig_fail;

          delete sibling;
        }

      // Now that our brothers can't create new tasks any more, kill
      // subtasks.  We assume here that subtasks of a task are always
      // enqueued /before/ thread 0 of a task.
      while (t->present_prev != t
             && t->present_prev->chief_index() == t->space_index())
        {
          // Look up thread 0 of that task and remember it: killing it
          // dequeues present_prev from the present queue.
          thread_t *subtask_leader
            = t->present_prev->space_index().lookup_thread();

          if (! subtask_leader->kill())
            return Sig_fail;

          delete subtask_leader;
        }

      //
      // Flush our address space
      //

      // It might be faster to flush our address space before actually
      // deleting the subtasks: Under the assumption that child tasks
      // got many of their mappings from this task, flushing mappings
      // early will flush their mappings as well and speed up their
      // deletion.  However, this would yield increased ugliness
      // because subtasks could still be scheduled and do page faults.
      // XXX check /how/ ugly...

      fpage_unmap(t->space(), l4_fpage(0, L4_WHOLE_ADDRESS_SPACE, 0, 0),
                  true, false);
    }

  //
  // Kill this thread
  //

  t->state_change(0, Thread_dead);

  // if attached to irqs, detach
  if (t->my_irq)
    t->my_irq->free(t);

  // if timeout active, abort
  if (t->my_timeout)
    t->my_timeout->reset();

  // if other threads are sending me signals, abort those
  while (signal_t *pending = t->my_signals.dequeue())
    pending->set_ret(Sig_aborted);

  // XXX what happens with signals that are enqueued from now on?

  // if other threads want to send me IPC messages, abort these
  // operations 

  // XXX that's quite difficult: how to traverse a list that may
  // change at all times?

  // if engaged in IPC operation, stop it
  if (t->in_sender_list())
    t->sender_dequeue(t->my_send_partner);

  // dequeue from system queues
  assert(t->in_present_list());
  t->present_dequeue();

  // XXX (don't need to dequeue from ready list -- this will happen
  // automagically when we're descheduled)
  if (t->in_ready_list())
    t->ready_dequeue();

  return Sig_success;
}

// Signal handler: kill and delete the task numbered _taskno.
//
// t must be thread 0 of the chief of that task (asserted).  Fails for
// a tcb that is Thread_invalid or dead, or when killing the task's
// thread fails.
unsigned long
signal_kill_task_t::func(thread_t *t)
{
  if (t->state() == Thread_invalid)
    return Sig_fail;

  if (t->state() & Thread_dead)
    return Sig_fail;

  assert(t->id().id.lthread == 0);

  space_index_t task(_taskno);

  assert(task.chief() == t->space_index());

  thread_t *victim = task.lookup_thread();

  if (victim->kill())
    {
      delete victim;
      return Sig_success;
    }

  return Sig_fail;
}



// helpers

// Restart routine whose address the constructor stores on a new
// thread's kernel stack.
//
// Keeps calling schedule() until the current thread has
// Thread_running set, then points %esp at the thread's saved register
// frame (current()->regs()), loads %ds and %es with the user data
// selector, zeroes the general-purpose registers and executes iret.
void 
thread_t::user_invoke()
{
  while (! (current()->state() & Thread_running))
    current()->schedule();

  asm volatile
    ("  movl %%eax,%%esp \n"	// set stack pointer to regs structure
     "  movw %1,%%ax \n"
     "  movw %%ax,%%es \n"
     "  movw %%ax,%%ds \n"
     "  xorl %%ecx,%%ecx\n"	// clean out user regs
     "  xorl %%edx,%%edx\n"
     "  xorl %%esi,%%esi\n"
     "  xorl %%edi,%%edi\n"
     "  xorl %%ebx,%%ebx\n"
     "  xorl %%ebp,%%ebp\n"
     "  xorl %%eax,%%eax\n"
     "  iret"
     :				// no output
     : "a" (current()->regs()), 
       "i" (kmem::gdt_data_user | SEL_PL_U)
     );

  // never returns here
}

// the global trap handler switch
// Dispatch a trap taken by this thread.
//
// Traps from kernel mode, NMIs, debugger interrupts and traps taken
// while the GDB stub is recovering go straight to the generic
// (nested) trap handler.  For user-mode traps the following are tried
// in order:
//   1. emulation of "lidt (%eax)" (records the user's IDT in my_idt),
//   2. reflection of the trap to a user-level handler found in my_idt,
//   3. the int3-based debug output / enter-kdebug protocol and debug
//      exceptions of tasks whose eflags IOPL equals EFL_IOPL_USER.
// If none applies the thread is killed.
//
// Returns true if the trap was consumed.  On the generic path, returns
// false if no nested trap handler is installed, otherwise true iff
// that handler returned 0.
//
// All integer <-> pointer conversions below use reinterpret_cast;
// static_cast is ill-formed for them.
inline bool
thread_t::handle_slow_trap(trap_state *ts)
{
  extern unsigned gdb_trap_recover; // in OSKit's gdb_trap.c
  unsigned eip;

  if (gdb_trap_recover)
    goto generic; // we're in the GDB stub -- let generic handler handle it

  if (! ((ts->cs & 3) || (ts->eflags & EFL_VM)))
    goto generic;		// we were in kernel mode -- nothing to emulate

  if (ts->trapno == 2)		// NMI?
    goto generic;		// NMI always enters kernel debugger

  if (ts->trapno == 0xffffffff)	// debugger interrupt
    goto generic;

  // user mode fault.  sanity checking...
#if 1	// XXX added only for a special bug
  if (this == kernel_thread)	// user mode fault in idle thread - holy cow!
    {
      cli();
      printf("KERNEL: user mode fault in idle; CR3=0x%x, ESP0=0x%x\n", 
	     get_cr3(), *kmem::kernel_esp());
      trap_dump(ts);
      panic("kernel is burning");
    }
#endif

  // so we were in user mode -- look for something to emulate

  // XXX the cancel flag should be reset upon return to user
  // the thread may have been canceled before we clear this flag.
  // however, in this case, all of the checks below will just fail,
  // which is OK for now.
  state_del(Thread_cancel);

  eip = ts->eip;

  // check for "lidt (%eax)" (opcode bytes 0f 01 18) causing a general
  // protection fault
  if (ts->trapno == 13 && ts->err == 0
      && eip < kmem::mem_user_max - 4
      && (*reinterpret_cast<unsigned long*>(eip) & 0xffffff) == 0x18010f)
    {
      // emulate "lidt (%eax)"

      // read descriptor: 16-bit limit at (%eax), base at 2(%eax)
      if (ts->eax >= kmem::mem_user_max - 6)
	goto fail;

      x86_gate *idt = *reinterpret_cast<x86_gate**>(ts->eax + 2);
      vm_size_t limit = config::backward_compatibility ? 255
	: *reinterpret_cast<unsigned short*>(ts->eax);

      // the whole table must lie in user space
      if (limit >= kmem::mem_user_max
	  || reinterpret_cast<vm_offset_t>(idt)
	     >= kmem::mem_user_max - limit - 1)
	goto fail;

      // OK; store descriptor (limit is kept as a number of gates)
      my_idt = idt;
      my_idt_limit = (limit + 1) / sizeof(x86_gate);

      // consume instruction and continue
      compare_and_swap(&ts->eip, eip, eip + 3); // ignore errors
      return true;
    }

  // let's see if we have a trampoline to invoke
  if (ts->trapno < 0x20
      && ts->trapno < my_idt_limit)
    {
      x86_gate *g = my_idt + ts->trapno;

      if ((g->word_count & 0xe0) == 0
	  && (g->access & 0x1f) == 0x0f) // gate descriptor ok?
	{
	  vm_offset_t o = (g->offset_high << 16) | g->offset_low;

	  if (o < kmem::mem_user_max // in user space?
	      && ts->esp <= kmem::mem_user_max
	      && ts->esp > 4 * 4) // enough space on user stack?
	    {
	      // OK, reflect the trap to user mode
	      unsigned32_t *esp = reinterpret_cast<unsigned32_t*>(ts->esp);
	      
	      if (! raise_exception(ts, o))
		{
		  // someone interfered and changed our state 
		  check(state_del(Thread_cancel));

		  return true;
		}
	      
	      // build the exception frame on the user stack
	      *--esp = ts->eflags;
	      *--esp = kmem::gdt_code_user | SEL_PL_U;
	      *--esp = eip;

	      if ((ts->trapno >= 0x0a
		   && ts->trapno <= 0x0e)
		  || ts->trapno == 0x08
		  || ts->trapno == 0x11) // need to push error code?
		{
		  *--esp = ts->err;
		}

	      return true;	// we've consumed the trap
	    }
	}
    }
  
  // backward compatibility cruft: check for those insane "int3" debug
  // messaging command sequences
  if (ts->trapno == 3 
      && (ts->eflags & EFL_IOPL) == EFL_IOPL_USER) // only allow priv tasks
    {
      // no bounds checking here -- we assume privileged tasks are
      // civilized citizens :-)
      unsigned char todo = *reinterpret_cast<char*>(eip);
      bool enter_kdb = false;

      char *str;
      int len;

      switch (todo)
	{
	case 0xeb:		// jmp == enter_kdebug()
	  printf("KDB: ");

	  // the jmp's displacement byte is the length of the message
	  // that follows it
	  len = *reinterpret_cast<char*>(eip + 1);
	  str = reinterpret_cast<char*>(eip + 2);

	  enter_kdb = true;
	  goto printstr;

	case 0x90:		// nop == kd_display()
	  if (*reinterpret_cast<unsigned char*>(eip + 1) != 0xeb /*jmp*/)
	    goto nostr;

	  len = *reinterpret_cast<char*>(eip + 2);
	  str = reinterpret_cast<char*>(eip + 3);

	printstr:
	  if (len <= 0)
	    goto nostr;

	  while (len--)
	    putchar(*str++);
      
	  break;

	case 0x3c:		// cmpb
	  // the immediate operand selects the output operation
	  todo = *reinterpret_cast<char*>(eip + 1);

	  switch (todo)
	    {
	    case 0:		// outchar
	      putchar(ts->eax & 0xff);
	      break;

	    case 2:		// outstring
	      printf("%s", reinterpret_cast<const char*>(ts->eax));
	      break;

	    case 5:		// outhex32
	      printf("%08x", ts->eax);
	      break;

	    case 6:		// outhex20
	      printf("%05x", ts->eax & 0xfffff);
	      break;

	    case 7:		// outhex16
	      printf("%04x", ts->eax & 0xffff);
	      break;

	    case 11:		// outdec
	      printf("%d", ts->eax);
	      break;

	    default:		// ko
	      if (todo < ' ')
		goto nostr;

	      putchar(todo);
	    }	  

	  break;

	default:		// user breakpoint
	  goto nostr;
	}
      
      if (enter_kdb)
	{
	  if (kdb::connected())
	    {
	      printf("\n");
	      goto generic;
	    }

	  // no kernel debugger present
	  printf(" [Ret/Esc]\n"); // ask whether we should continue or panic

	  if (getchar() == '\033') // esc pressed?
	    goto generic;	// panic
	}

      return true;		// success -- consume int3

    nostr:
      printf("KDB: int3\n");
      goto generic;		// enter the kernel debugger
    }

  // privileged tasks also may invoke the kernel debugger with a debug
  // exception
  if (ts->trapno == 1
      && (ts->eflags & EFL_IOPL) == EFL_IOPL_USER) // only allow priv tasks
    {
      goto generic;
    }

  // can't handle trap -- kill the thread

fail:
  printf("KERNEL: %x.%x (tcb=0x%x) killed: unhandled trap\n",
	 unsigned(space_index()), id().id.lthread, (unsigned)this);
  trap_dump(ts);

  if (config::conservative)
    kdb::ke("thread killed");

  if (state_change_safely(~Thread_running, Thread_cancel|Thread_dead))
    {
      // we haven't been re-initialized (cancel was not set) -- so sleep
      while (! (state() & Thread_running))
	schedule();
    }
  
  return true;			
    
generic:
  if (!nested_trap_handler)
    return false;

  cli();
  //  kdb::com_port_init();		// re-initialize the GDB serial port

  // run the nested trap handler on a separate stack
  // equiv of: return nested_trap_handler(ts) == 0 ? true : false;

  int ret;
  static char nested_handler_stack[PAGE_SIZE];

  // %ebx = new stack top (0: stay on the current stack), %ecx = ts,
  // %edx = handler; the old %esp is saved on the stack in use and
  // restored after the call
  asm volatile
    (" movl %%esp,%%eax \n"
     " orl  %%ebx,%%ebx \n"	// don't set %esp if gdb fault recovery
     " jz   1f \n"
     " movl %%ebx,%%esp \n"
     "1: pushl %%eax \n"
     " pushl %%ecx \n"
     " call *%%edx \n"
     " addl $4,%%esp \n"
     " popl %%esp"
     : "=a" (ret)
     : "b" (gdb_trap_recover ?	// gdb fault recovery?
	    0 : (nested_handler_stack + sizeof(nested_handler_stack))),
       "c" (ts),
       "d" (nested_trap_handler)
     : "ecx", "edx", "memory");
     
  return ret == 0 ? true : false;
}

// the global page fault handler switch
// called from entry.S
// Handle a page fault at address pfa with hardware error code
// error_code (bit 0: protection fault rather than page not present,
// bit 1: write access, bit 2: fault occurred in user mode -- as used
// by the tests below).
//
// Cases, in order:
//   - GDB stub is recovering: not handled here.
//   - Fault below mem_user_max: sigma0 gets an identity mapping
//     inserted; every other task's fault is forwarded to its pager.
//   - User-mode access at or above mem_user_max: refused.
//   - Kernel-mode fault on a page that is mapped in the global page
//     directory: copy the entry into this address space.
//   - Kernel-mode fault in the tcb region: allocate (or zero-map) the
//     page, then update this address space.
//
// Returns true if the fault was resolved, false otherwise.
inline bool
thread_t::handle_page_fault(vm_offset_t pfa, unsigned error_code)
{
  extern unsigned gdb_trap_recover; // in OSKit's gdb_trap.c

  if (gdb_trap_recover)
    return false; // we're in the GDB stub -- let generic handler handle it

  if (pfa < kmem::mem_user_max)	// page fault in user space
    {
      if (space_index() == config::sigma0_taskno)
	{
	  // special case: sigma0 can map in anything from the kernel
	  // (identity mapping; a superpage if the CPU supports global
	  // pages, a normal page otherwise)

	  return ((cpu.feature_flags & CPUF_PAGE_GLOBAL_EXT) 
		  ? space()->v_insert(trunc_superpage(pfa), 
				      trunc_superpage(pfa), 
				      SUPERPAGE_SIZE)
		  : space()->v_insert(trunc_page(pfa), 
				      trunc_page(pfa), 
				      PAGE_SIZE))
	    != space_t::Insert_err_nomem;
	}

      // user mode page fault -- send pager request
      return handle_page_fault_pager(pfa, error_code);
    }

  else if (error_code & 4)	// user mode?
    {
      return false;		// disallow access after mem_user_max
    }

  // we're in kernel mode

  // pfa is an integer address; converting it to a pointer requires
  // reinterpret_cast (static_cast is ill-formed here)
  else if ((! (error_code & 1))	// page not present
	   && kmem::virt_to_phys(reinterpret_cast<void*>(pfa)) != 0xffffffff)
    {
      // the global page directory has a page which is not present in
      // the thread's local page dir -- copy

      space()->kmem_update(pfa);
      return true;
    }

  else if ((pfa & ~(config::thread_block_size * (1 << (7 + 11)) - 1))
	   == kmem::mem_tcbs) // in the tcb space
    {
      if (! (error_code & 1))	// page not present
	{
	  // in case of read fault, just map in the shared zero page
	  // otherwise -> allocate
	  if (!kmem::page_alloc(pfa & ~PAGE_MASK, 
				(error_code & 2) ? kmem::zero_fill
				                 : kmem::zero_map))
	    panic("can't alloc kernel page");
	}
      else
	{	  
	  // protection fault
	  // this can only be because we have the zero page mapped
	  kmem::page_free(reinterpret_cast<void*>(pfa & ~PAGE_MASK));
	  if (! kmem::page_alloc(pfa & ~PAGE_MASK, kmem::zero_fill))
	    {
	      // error could mean: someone else was faster allocating
	      // a page there, or we just don't have any pages left; verify
	      if (kmem::virt_to_phys(reinterpret_cast<void*>(pfa))
		  == 0xffffffff)
		panic("can't alloc kernel page");

	      // otherwise, there's a page mapped.  continue
	    }
	}

      space()->kmem_update(pfa);
      return true;
    }

  printf("KERNEL: no page fault handler for region: 0x%x, error 0x%x\n",
	 pfa, error_code);

  return false;
}

// C-linkage page fault entry: forwards the fault to the current
// thread's handle_page_fault() and returns its result as an int
// (non-zero if the fault was handled).
extern "C" int 
thread_page_fault(vm_offset_t pfa, unsigned error_code)
{
#if 0
  printf("*P[%x,%x,%x] ", pfa, error_code, (unsigned) __builtin_return_address(1));
#endif

  thread_t *const faulting_thread = current();
  const bool handled = faulting_thread->handle_page_fault(pfa, error_code);

  return handled;
}

// the timer interrupt is activated on every clock tick
// Timer tick handler (irq 8, the RTC).
//
// Acknowledges the interrupt with the irq masked, updates the system
// clock (which also handles timeouts), re-enables interrupts and
// invokes the scheduler once the running thread's tick budget
// (timeslice_ticks_left) is used up.
extern "C" void
thread_timer_interrupt(void)
{
  // we're entered with disabled interrupts; that's necessary so that
  // we can't be preempted until we've acknowledged the interrupt and
  // updated the clock

  pic_disable_irq(8);
  rtcin(RTC_INTR);		// reset clock -- 1st
#if 1
  irq_ack(8);			// acknowledge interrupt
#else
  pic_ack(8);			// acknowledge interrupt
#endif
  rtcin(RTC_INTR);		// reset clock -- 2nd
  pic_enable_irq(8);

#if 0				// screen spinner for debugging purposes
  (*(unsigned char*)(kmem::phys_to_virt(0xb8000 + 16)))++;
#endif

  // update clock and handle timeouts
  timer::update_system_clock();

  // re-enable interrupts
  sti();

  // need to reschedule?
  if (--timeslice_ticks_left <= 0)
    current()->schedule();
}

// the slow trap entry point
// C-linkage slow trap entry: lets the current thread handle the trap.
// Returns 0 if the handler consumed the trap and -1 otherwise.
extern "C" int
thread_handle_trap(trap_state *state)
{
  if (current()->handle_slow_trap(state))
    return 0;			// success -- handler consumed trap

  return -1;
}
