diff -uprN linux-2.6.8.1.orig/arch/alpha/kernel/ptrace.c linux-2.6.8.1-ve022stab032/arch/alpha/kernel/ptrace.c
--- linux-2.6.8.1.orig/arch/alpha/kernel/ptrace.c	2004-08-14 14:56:14.000000000 +0400
+++ linux-2.6.8.1-ve022stab032/arch/alpha/kernel/ptrace.c	2005-08-11 19:41:30.000000000 +0400
@@ -354,7 +354,7 @@ do_sys_ptrace(long request, long pid, lo
 	 */
 	case PTRACE_KILL:
 		ret = 0;
-		if (child->state == TASK_ZOMBIE)
+		if (child->exit_state == EXIT_ZOMBIE)
 			break;
 		child->exit_code = SIGKILL;
 		/* make sure single-step breakpoint is gone. */
diff -uprN linux-2.6.8.1.orig/arch/arm/kernel/ptrace.c linux-2.6.8.1-ve022stab032/arch/arm/kernel/ptrace.c
--- linux-2.6.8.1.orig/arch/arm/kernel/ptrace.c	2004-08-14 14:54:49.000000000 +0400
+++ linux-2.6.8.1-ve022stab032/arch/arm/kernel/ptrace.c	2005-08-11 19:41:30.000000000 +0400
@@ -677,7 +677,7 @@ static int do_ptrace(int request, struct
 		/* make sure single-step breakpoint is gone. */
 		child->ptrace &= ~PT_SINGLESTEP;
 		ptrace_cancel_bpt(child);
-		if (child->state != TASK_ZOMBIE) {
+		if (child->exit_state != EXIT_ZOMBIE) {
 			child->exit_code = SIGKILL;
 			wake_up_process(child);
 		}
diff -uprN linux-2.6.8.1.orig/arch/arm/kernel/signal.c linux-2.6.8.1-ve022stab032/arch/arm/kernel/signal.c
--- linux-2.6.8.1.orig/arch/arm/kernel/signal.c	2004-08-14 14:54:47.000000000 +0400
+++ linux-2.6.8.1-ve022stab032/arch/arm/kernel/signal.c	2005-08-11 19:41:29.000000000 +0400
@@ -548,9 +548,10 @@ static int do_signal(sigset_t *oldset, s
 	if (!user_mode(regs))
 		return 0;
 
-	if (current->flags & PF_FREEZE) {
-		refrigerator(0);
-		goto no_signal;
+	if (unlikely(test_thread_flag(TIF_FREEZE))) {
+		refrigerator();
+		if (!signal_pending(current))
+			goto no_signal;
 	}
 
 	if (current->ptrace & PT_SINGLESTEP)
diff -uprN linux-2.6.8.1.orig/arch/arm26/kernel/ptrace.c linux-2.6.8.1-ve022stab032/arch/arm26/kernel/ptrace.c
--- linux-2.6.8.1.orig/arch/arm26/kernel/ptrace.c	2004-08-14 14:55:32.000000000 +0400
+++ linux-2.6.8.1-ve022stab032/arch/arm26/kernel/ptrace.c	2005-08-11 19:41:30.000000000 +0400
@@ -614,7 +614,7 @@ static int do_ptrace(int request, struct
 		/* make sure single-step breakpoint is gone. */
 		child->ptrace &= ~PT_SINGLESTEP;
 		ptrace_cancel_bpt(child);
-		if (child->state != TASK_ZOMBIE) {
+		if (child->exit_state != EXIT_ZOMBIE) {
 			child->exit_code = SIGKILL;
 			wake_up_process(child);
 		}
diff -uprN linux-2.6.8.1.orig/arch/cris/arch-v10/kernel/ptrace.c linux-2.6.8.1-ve022stab032/arch/cris/arch-v10/kernel/ptrace.c
--- linux-2.6.8.1.orig/arch/cris/arch-v10/kernel/ptrace.c	2004-08-14 14:56:23.000000000 +0400
+++ linux-2.6.8.1-ve022stab032/arch/cris/arch-v10/kernel/ptrace.c	2005-08-11 19:41:30.000000000 +0400
@@ -185,7 +185,7 @@ sys_ptrace(long request, long pid, long
 
 		case PTRACE_KILL:
 			ret = 0;
-			if (child->state == TASK_ZOMBIE)
+			if (child->exit_state == EXIT_ZOMBIE)
 				break;
 
 			child->exit_code = SIGKILL;
diff -uprN linux-2.6.8.1.orig/arch/h8300/kernel/ptrace.c linux-2.6.8.1-ve022stab032/arch/h8300/kernel/ptrace.c
--- linux-2.6.8.1.orig/arch/h8300/kernel/ptrace.c	2004-08-14 14:55:10.000000000 +0400
+++ linux-2.6.8.1-ve022stab032/arch/h8300/kernel/ptrace.c	2005-08-11 19:41:30.000000000 +0400
@@ -199,7 +199,7 @@ asmlinkage int sys_ptrace(long request,
 
 		case PTRACE_KILL: {
 			ret = 0;
-			if (child->state == TASK_ZOMBIE) /* already dead */
+			if (child->exit_state == EXIT_ZOMBIE) /* already dead */
 				break;
 			child->exit_code = SIGKILL;
 			h8300_disable_trace(child);
diff -uprN linux-2.6.8.1.orig/arch/i386/boot/setup.S linux-2.6.8.1-ve022stab032/arch/i386/boot/setup.S
--- linux-2.6.8.1.orig/arch/i386/boot/setup.S	2004-08-14 14:55:33.000000000 +0400
+++ linux-2.6.8.1-ve022stab032/arch/i386/boot/setup.S	2005-08-11 19:41:30.000000000 +0400
@@ -156,7 +156,7 @@ cmd_line_ptr:	.long 0			# (Header versio
 						# can be located anywhere in
 						# low memory 0x10000 or higher.
 
-ramdisk_max:	.long (MAXMEM-1) & 0x7fffffff
+ramdisk_max:	.long (__MAXMEM-1) & 0x7fffffff
 					# (Header version 0x0203 or later)
 					# The highest safe address for
 					# the contents of an initrd
diff -uprN linux-2.6.8.1.orig/arch/i386/Kconfig.open_virtuozzo linux-2.6.8.1-ve022stab032/arch/i386/Kconfig.open_virtuozzo
--- linux-2.6.8.1.orig/arch/i386/Kconfig.open_virtuozzo	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.8.1-ve022stab032/arch/i386/Kconfig.open_virtuozzo	2005-08-11 19:41:31.000000000 +0400
@@ -0,0 +1,44 @@
+config VE
+	bool "Virtual Environment support"
+	default y
+	help
+	  This option adds support for virtual Linux environments running on the
+	  original box, with a fully supported virtual network driver, a virtual
+	  tty subsystem, and configurable access to hardware and other resources.
+
+config VE_CALLS
+	tristate "VE calls interface"
+	depends on VE
+	default m
+	help
+	  This option controls how the vzmon code containing the VE calls is built.
+	  By default it is built as the module vzmon.o.
+
+config VE_SYSFS
+	bool "Enable sysfs support in Virtual Environments"
+	depends on VE
+	default n
+	help
+	  This option enables sysfs support in Virtual Environments.
+
+config VE_NETDEV
+	tristate "VE networking"
+	depends on VE
+	default m
+	help
+	  This option controls whether to build the VE networking code.
+
+config VE_IPTABLES
+	bool "VE netfiltering"
+	depends on VE && VE_NETDEV && INET && NETFILTER
+	default y
+	help
+	  This option controls whether to build the VE netfiltering code.
+
+config VZ_WDOG
+	tristate "VE watchdog module"
+	depends on VE
+	default m
+	help
+	  This option controls the building of the vzwdog module, which periodically
+	  dumps a lot of useful system information to the console.
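The Kconfig fragment above only declares the options; the objects they control are wired up by build rules elsewhere in the patch. As a rough illustration of the usual kbuild convention (a hypothetical sketch, not taken from this patch; the object names are inferred from the help texts), a tristate option selects between a module and a built-in object like this:

	# hypothetical kbuild wiring for the tristate options above;
	# 'm' builds a loadable module, 'y' links the code into the kernel image
	obj-$(CONFIG_VE_CALLS)	+= vzmon.o
	obj-$(CONFIG_VZ_WDOG)	+= vzwdog.o

With CONFIG_VE_CALLS=m the VE-calls code is built as the vzmon module, matching the default described in the help text; bool options such as CONFIG_VE_IPTABLES only gate conditional compilation.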
diff -uprN linux-2.6.8.1.orig/arch/i386/kernel/acpi/boot.c linux-2.6.8.1-ve022stab032/arch/i386/kernel/acpi/boot.c --- linux-2.6.8.1.orig/arch/i386/kernel/acpi/boot.c 2004-08-14 14:56:01.000000000 +0400 +++ linux-2.6.8.1-ve022stab032/arch/i386/kernel/acpi/boot.c 2005-08-11 19:41:30.000000000 +0400 @@ -484,7 +484,7 @@ acpi_scan_rsdp ( * RSDP signature. */ for (offset = 0; offset < length; offset += 16) { - if (strncmp((char *) (start + offset), "RSD PTR ", sig_len)) + if (strncmp((char *) __va(start + offset), "RSD PTR ", sig_len)) continue; return (start + offset); } diff -uprN linux-2.6.8.1.orig/arch/i386/kernel/acpi/sleep.c linux-2.6.8.1-ve022stab032/arch/i386/kernel/acpi/sleep.c --- linux-2.6.8.1.orig/arch/i386/kernel/acpi/sleep.c 2004-08-14 14:55:32.000000000 +0400 +++ linux-2.6.8.1-ve022stab032/arch/i386/kernel/acpi/sleep.c 2005-08-11 19:41:30.000000000 +0400 @@ -19,13 +19,29 @@ extern void zap_low_mappings(void); extern unsigned long FASTCALL(acpi_copy_wakeup_routine(unsigned long)); -static void init_low_mapping(pgd_t *pgd, int pgd_limit) +static void map_low(pgd_t *pgd_base, unsigned long start, unsigned long end) { - int pgd_ofs = 0; - - while ((pgd_ofs < pgd_limit) && (pgd_ofs + USER_PTRS_PER_PGD < PTRS_PER_PGD)) { - set_pgd(pgd, *(pgd+USER_PTRS_PER_PGD)); - pgd_ofs++, pgd++; + unsigned long vaddr; + pmd_t *pmd; + pgd_t *pgd; + int i, j; + + pgd = pgd_base; + + for (i = 0; i < PTRS_PER_PGD; pgd++, i++) { + vaddr = i*PGDIR_SIZE; + if (end && (vaddr >= end)) + break; + pmd = pmd_offset(pgd, 0); + for (j = 0; j < PTRS_PER_PMD; pmd++, j++) { + vaddr = i*PGDIR_SIZE + j*PMD_SIZE; + if (end && (vaddr >= end)) + break; + if (vaddr < start) + continue; + set_pmd(pmd, __pmd(_KERNPG_TABLE + _PAGE_PSE + + vaddr - start)); + } } } @@ -39,7 +55,9 @@ int acpi_save_state_mem (void) { if (!acpi_wakeup_address) return 1; - init_low_mapping(swapper_pg_dir, USER_PTRS_PER_PGD); + if (!cpu_has_pse) + return 1; + map_low(swapper_pg_dir, 0, LOW_MAPPINGS_SIZE); memcpy((void *) acpi_wakeup_address, &wakeup_start, &wakeup_end - &wakeup_start); acpi_copy_wakeup_routine(acpi_wakeup_address); diff -uprN linux-2.6.8.1.orig/arch/i386/kernel/acpi/wakeup.S linux-2.6.8.1-ve022stab032/arch/i386/kernel/acpi/wakeup.S --- linux-2.6.8.1.orig/arch/i386/kernel/acpi/wakeup.S 2004-08-14 14:54:51.000000000 +0400 +++ linux-2.6.8.1-ve022stab032/arch/i386/kernel/acpi/wakeup.S 2005-08-11 19:41:30.000000000 +0400 @@ -67,6 +67,13 @@ wakeup_code: movw $0x0e00 + 'i', %fs:(0x12) # need a gdt + #use the gdt copied in this low mem + lea temp_gdt_table - wakeup_code, %eax + xor %ebx, %ebx + movw %ds, %bx + shll $4, %ebx + addl %ebx, %eax + movl %eax, real_save_gdt + 2 - wakeup_code lgdt real_save_gdt - wakeup_code movl real_save_cr0 - wakeup_code, %eax @@ -89,6 +96,7 @@ real_save_cr4: .long 0 real_magic: .long 0 video_mode: .long 0 video_flags: .long 0 +temp_gdt_table: .fill GDT_ENTRIES, 8, 0 bogus_real_magic: movw $0x0e00 + 'B', %fs:(0x12) @@ -231,6 +239,13 @@ ENTRY(acpi_copy_wakeup_routine) movl %edx, real_save_cr0 - wakeup_start (%eax) sgdt real_save_gdt - wakeup_start (%eax) + # gdt wont be addressable from real mode in 4g4g split + # copying it to the lower mem + xor %ecx, %ecx + movw saved_gdt, %cx + movl saved_gdt + 2, %esi + lea temp_gdt_table - wakeup_start (%eax), %edi + rep movsb movl saved_videomode, %edx movl %edx, video_mode - wakeup_start (%eax) movl acpi_video_flags, %edx diff -uprN linux-2.6.8.1.orig/arch/i386/kernel/apic.c linux-2.6.8.1-ve022stab032/arch/i386/kernel/apic.c --- 
linux-2.6.8.1.orig/arch/i386/kernel/apic.c	2004-08-14 14:56:24.000000000 +0400
+++ linux-2.6.8.1-ve022stab032/arch/i386/kernel/apic.c	2005-08-11 19:41:30.000000000 +0400
@@ -1089,6 +1089,7 @@ inline void smp_local_timer_interrupt(st
 void smp_apic_timer_interrupt(struct pt_regs regs)
 {
 	int cpu = smp_processor_id();
+	struct ve_struct *envid;
 
 	/*
 	 * the NMI deadlock-detector uses this.
@@ -1105,9 +1106,11 @@ void smp_apic_timer_interrupt(struct pt_
 	 * Besides, if we don't timer interrupts ignore the global
 	 * interrupt lock, which is the WrongThing (tm) to do.
 	 */
+	envid = set_exec_env(get_ve0());
 	irq_enter();
 	smp_local_timer_interrupt(&regs);
 	irq_exit();
+	(void)set_exec_env(envid);
 }
 
 /*
diff -uprN linux-2.6.8.1.orig/arch/i386/kernel/asm-offsets.c linux-2.6.8.1-ve022stab032/arch/i386/kernel/asm-offsets.c
--- linux-2.6.8.1.orig/arch/i386/kernel/asm-offsets.c	2004-08-14 14:55:10.000000000 +0400
+++ linux-2.6.8.1-ve022stab032/arch/i386/kernel/asm-offsets.c	2005-08-11 19:41:30.000000000 +0400
@@ -61,5 +61,19 @@ void foo(void)
 	DEFINE(TSS_sysenter_esp0, offsetof(struct tss_struct, esp0) -
 		 sizeof(struct tss_struct));
 
+	DEFINE(TI_task, offsetof (struct thread_info, task));
+	DEFINE(TI_exec_domain, offsetof (struct thread_info, exec_domain));
+	DEFINE(TI_flags, offsetof (struct thread_info, flags));
+	DEFINE(TI_preempt_count, offsetof (struct thread_info, preempt_count));
+	DEFINE(TI_addr_limit, offsetof (struct thread_info, addr_limit));
+	DEFINE(TI_real_stack, offsetof (struct thread_info, real_stack));
+	DEFINE(TI_virtual_stack, offsetof (struct thread_info, virtual_stack));
+	DEFINE(TI_user_pgd, offsetof (struct thread_info, user_pgd));
+
+	DEFINE(FIX_ENTRY_TRAMPOLINE_0_addr,
+			__fix_to_virt(FIX_ENTRY_TRAMPOLINE_0));
+	DEFINE(FIX_VSYSCALL_addr, __fix_to_virt(FIX_VSYSCALL));
 	DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
+	DEFINE(task_thread_db7,
+			offsetof (struct task_struct, thread.debugreg[7]));
 }
diff -uprN linux-2.6.8.1.orig/arch/i386/kernel/cpu/common.c linux-2.6.8.1-ve022stab032/arch/i386/kernel/cpu/common.c
--- linux-2.6.8.1.orig/arch/i386/kernel/cpu/common.c	2004-08-14 14:54:48.000000000 +0400
+++ linux-2.6.8.1-ve022stab032/arch/i386/kernel/cpu/common.c	2005-08-11 19:41:30.000000000 +0400
@@ -554,12 +554,16 @@ void __init cpu_init (void)
 	set_tss_desc(cpu,t);
 	cpu_gdt_table[cpu][GDT_ENTRY_TSS].b &= 0xfffffdff;
 	load_TR_desc();
-	load_LDT(&init_mm.context);
+	if (cpu)
+		load_LDT(&init_mm.context);
 
 	/* Set up doublefault TSS pointer in the GDT */
 	__set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
 	cpu_gdt_table[cpu][GDT_ENTRY_DOUBLEFAULT_TSS].b &= 0xfffffdff;
 
+	if (cpu)
+		trap_init_virtual_GDT();
+
 	/* Clear %fs and %gs. */
 	asm volatile ("xorl %eax, %eax; movl %eax, %fs; movl %eax, %gs");
 
diff -uprN linux-2.6.8.1.orig/arch/i386/kernel/cpu/intel.c linux-2.6.8.1-ve022stab032/arch/i386/kernel/cpu/intel.c
--- linux-2.6.8.1.orig/arch/i386/kernel/cpu/intel.c	2004-08-14 14:55:09.000000000 +0400
+++ linux-2.6.8.1-ve022stab032/arch/i386/kernel/cpu/intel.c	2005-08-11 19:41:30.000000000 +0400
@@ -10,6 +10,7 @@
 #include
 #include
 #include
+#include
 
 #include "cpu.h"
 
@@ -19,8 +20,6 @@
 #include
 #endif
 
-extern int trap_init_f00f_bug(void);
-
 #ifdef CONFIG_X86_INTEL_USERCOPY
 /*
  * Alignment at which movsl is preferred for bulk memory copies.
@@ -147,7 +146,7 @@ static void __init init_intel(struct cpu c->f00f_bug = 1; if ( !f00f_workaround_enabled ) { - trap_init_f00f_bug(); + trap_init_virtual_IDT(); printk(KERN_NOTICE "Intel Pentium with F0 0F bug - workaround enabled.\n"); f00f_workaround_enabled = 1; } diff -uprN linux-2.6.8.1.orig/arch/i386/kernel/cpu/mtrr/if.c linux-2.6.8.1-ve022stab032/arch/i386/kernel/cpu/mtrr/if.c --- linux-2.6.8.1.orig/arch/i386/kernel/cpu/mtrr/if.c 2004-08-14 14:54:51.000000000 +0400 +++ linux-2.6.8.1-ve022stab032/arch/i386/kernel/cpu/mtrr/if.c 2005-08-11 19:41:30.000000000 +0400 @@ -358,7 +358,7 @@ static int __init mtrr_if_init(void) return -ENODEV; proc_root_mtrr = - create_proc_entry("mtrr", S_IWUSR | S_IRUGO, &proc_root); + create_proc_entry("mtrr", S_IWUSR | S_IRUGO, NULL); if (proc_root_mtrr) { proc_root_mtrr->owner = THIS_MODULE; proc_root_mtrr->proc_fops = &mtrr_fops; diff -uprN linux-2.6.8.1.orig/arch/i386/kernel/cpu/proc.c linux-2.6.8.1-ve022stab032/arch/i386/kernel/cpu/proc.c --- linux-2.6.8.1.orig/arch/i386/kernel/cpu/proc.c 2004-08-14 14:56:09.000000000 +0400 +++ linux-2.6.8.1-ve022stab032/arch/i386/kernel/cpu/proc.c 2005-08-11 19:41:31.000000000 +0400 @@ -3,6 +3,8 @@ #include #include #include +#include +#include /* * Get CPU information for use by the procfs. @@ -58,11 +60,17 @@ static int show_cpuinfo(struct seq_file struct cpuinfo_x86 *c = v; int i, n = c - cpu_data; int fpu_exception; + unsigned long vcpu_khz; #ifdef CONFIG_SMP - if (!cpu_online(n)) + if (!vcpu_online(n)) return 0; #endif +#ifdef CONFIG_VE + vcpu_khz = ve_scale_khz(cpu_khz); +#else + vcpu_khz = cpu_khz; +#endif seq_printf(m, "processor\t: %d\n" "vendor_id\t: %s\n" "cpu family\t: %d\n" @@ -81,7 +89,7 @@ static int show_cpuinfo(struct seq_file if ( cpu_has(c, X86_FEATURE_TSC) ) { seq_printf(m, "cpu MHz\t\t: %lu.%03lu\n", - cpu_khz / 1000, (cpu_khz % 1000)); + vcpu_khz / 1000, (vcpu_khz % 1000)); } /* Cache size */ diff -uprN linux-2.6.8.1.orig/arch/i386/kernel/doublefault.c linux-2.6.8.1-ve022stab032/arch/i386/kernel/doublefault.c --- linux-2.6.8.1.orig/arch/i386/kernel/doublefault.c 2004-08-14 14:54:50.000000000 +0400 +++ linux-2.6.8.1-ve022stab032/arch/i386/kernel/doublefault.c 2005-08-11 19:41:30.000000000 +0400 @@ -8,12 +8,13 @@ #include #include #include +#include #define DOUBLEFAULT_STACKSIZE (1024) static unsigned long doublefault_stack[DOUBLEFAULT_STACKSIZE]; #define STACK_START (unsigned long)(doublefault_stack+DOUBLEFAULT_STACKSIZE) -#define ptr_ok(x) ((x) > 0xc0000000 && (x) < 0xc1000000) +#define ptr_ok(x) (((x) > __PAGE_OFFSET && (x) < (__PAGE_OFFSET + 0x01000000)) || ((x) >= FIXADDR_START)) static void doublefault_fn(void) { @@ -39,8 +40,8 @@ static void doublefault_fn(void) printk("eax = %08lx, ebx = %08lx, ecx = %08lx, edx = %08lx\n", t->eax, t->ebx, t->ecx, t->edx); - printk("esi = %08lx, edi = %08lx\n", - t->esi, t->edi); + printk("esi = %08lx, edi = %08lx, ebp = %08lx\n", + t->esi, t->edi, t->ebp); } } diff -uprN linux-2.6.8.1.orig/arch/i386/kernel/entry.S linux-2.6.8.1-ve022stab032/arch/i386/kernel/entry.S --- linux-2.6.8.1.orig/arch/i386/kernel/entry.S 2004-08-14 14:55:09.000000000 +0400 +++ linux-2.6.8.1-ve022stab032/arch/i386/kernel/entry.S 2005-08-11 19:41:30.000000000 +0400 @@ -43,8 +43,10 @@ #include #include #include +#include #include #include +#include #include #include #include "irq_vectors.h" @@ -81,7 +83,102 @@ VM_MASK = 0x00020000 #define resume_kernel restore_all #endif -#define SAVE_ALL \ +#ifdef CONFIG_X86_HIGH_ENTRY + +#ifdef CONFIG_X86_SWITCH_PAGETABLES + +#if 
defined(CONFIG_PREEMPT) && defined(CONFIG_SMP) +/* + * If task is preempted in __SWITCH_KERNELSPACE, and moved to another cpu, + * __switch_to repoints %esp to the appropriate virtual stack; but %ebp is + * left stale, so we must check whether to repeat the real stack calculation. + */ +#define repeat_if_esp_changed \ + xorl %esp, %ebp; \ + testl $-THREAD_SIZE, %ebp; \ + jnz 0b +#else +#define repeat_if_esp_changed +#endif + +/* clobbers ebx, edx and ebp */ + +#define __SWITCH_KERNELSPACE \ + cmpl $0xff000000, %esp; \ + jb 1f; \ + \ + /* \ + * switch pagetables and load the real stack, \ + * keep the stack offset: \ + */ \ + \ + movl $swapper_pg_dir-__PAGE_OFFSET, %edx; \ + \ + /* GET_THREAD_INFO(%ebp) intermixed */ \ +0: \ + movl %esp, %ebp; \ + movl %esp, %ebx; \ + andl $(-THREAD_SIZE), %ebp; \ + andl $(THREAD_SIZE-1), %ebx; \ + orl TI_real_stack(%ebp), %ebx; \ + repeat_if_esp_changed; \ + \ + movl %edx, %cr3; \ + movl %ebx, %esp; \ +1: + +#endif + + +#define __SWITCH_USERSPACE \ + /* interrupted any of the user return paths? */ \ + \ + movl EIP(%esp), %eax; \ + \ + cmpl $int80_ret_start_marker, %eax; \ + jb 33f; /* nope - continue with sysexit check */\ + cmpl $int80_ret_end_marker, %eax; \ + jb 22f; /* yes - switch to virtual stack */ \ +33: \ + cmpl $sysexit_ret_start_marker, %eax; \ + jb 44f; /* nope - continue with user check */ \ + cmpl $sysexit_ret_end_marker, %eax; \ + jb 22f; /* yes - switch to virtual stack */ \ + /* return to userspace? */ \ +44: \ + movl EFLAGS(%esp),%ecx; \ + movb CS(%esp),%cl; \ + testl $(VM_MASK | 3),%ecx; \ + jz 2f; \ +22: \ + /* \ + * switch to the virtual stack, then switch to \ + * the userspace pagetables. \ + */ \ + \ + GET_THREAD_INFO(%ebp); \ + movl TI_virtual_stack(%ebp), %edx; \ + movl TI_user_pgd(%ebp), %ecx; \ + \ + movl %esp, %ebx; \ + andl $(THREAD_SIZE-1), %ebx; \ + orl %ebx, %edx; \ +int80_ret_start_marker: \ + movl %edx, %esp; \ + movl %ecx, %cr3; \ + \ + __RESTORE_ALL_USER; \ +int80_ret_end_marker: \ +2: + +#else /* !CONFIG_X86_HIGH_ENTRY */ + +#define __SWITCH_KERNELSPACE +#define __SWITCH_USERSPACE + +#endif + +#define __SAVE_ALL \ cld; \ pushl %es; \ pushl %ds; \ @@ -96,7 +193,7 @@ VM_MASK = 0x00020000 movl %edx, %ds; \ movl %edx, %es; -#define RESTORE_INT_REGS \ +#define __RESTORE_INT_REGS \ popl %ebx; \ popl %ecx; \ popl %edx; \ @@ -105,29 +202,44 @@ VM_MASK = 0x00020000 popl %ebp; \ popl %eax -#define RESTORE_REGS \ - RESTORE_INT_REGS; \ -1: popl %ds; \ -2: popl %es; \ -.section .fixup,"ax"; \ -3: movl $0,(%esp); \ - jmp 1b; \ -4: movl $0,(%esp); \ - jmp 2b; \ -.previous; \ +#define __RESTORE_REGS \ + __RESTORE_INT_REGS; \ + popl %ds; \ + popl %es; + +#define __RESTORE_REGS_USER \ + __RESTORE_INT_REGS; \ +111: popl %ds; \ +222: popl %es; \ + jmp 666f; \ +444: movl $0,(%esp); \ + jmp 111b; \ +555: movl $0,(%esp); \ + jmp 222b; \ +666: \ .section __ex_table,"a";\ .align 4; \ - .long 1b,3b; \ - .long 2b,4b; \ + .long 111b,444b;\ + .long 222b,555b;\ .previous +#define __RESTORE_ALL_USER \ + __RESTORE_REGS_USER \ + __RESTORE_IRET + +#ifdef CONFIG_X86_HIGH_ENTRY +#define __RESTORE_ALL \ + __RESTORE_REGS \ + __RESTORE_IRET +#else /* !CONFIG_X86_HIGH_ENTRY */ +#define __RESTORE_ALL __RESTORE_ALL_USER +#endif -#define RESTORE_ALL \ - RESTORE_REGS \ +#define __RESTORE_IRET \ addl $4, %esp; \ -1: iret; \ +333: iret; \ .section .fixup,"ax"; \ -2: sti; \ +666: sti; \ movl $(__USER_DS), %edx; \ movl %edx, %ds; \ movl %edx, %es; \ @@ -136,10 +248,18 @@ VM_MASK = 0x00020000 .previous; \ .section __ex_table,"a";\ .align 4; \ - .long 1b,2b; \ + .long 
333b,666b;\ .previous +#define SAVE_ALL \ + __SAVE_ALL; \ + __SWITCH_KERNELSPACE; + +#define RESTORE_ALL \ + __SWITCH_USERSPACE; \ + __RESTORE_ALL; +.section .entry.text,"ax" ENTRY(lcall7) pushfl # We get a different stack layout with call @@ -240,17 +360,9 @@ sysenter_past_esp: pushl $(__USER_CS) pushl $SYSENTER_RETURN -/* - * Load the potential sixth argument from user stack. - * Careful about security. - */ - cmpl $__PAGE_OFFSET-3,%ebp - jae syscall_fault -1: movl (%ebp),%ebp -.section __ex_table,"a" - .align 4 - .long 1b,syscall_fault -.previous + /* + * No six-argument syscall is ever used with sysenter. + */ pushl %eax SAVE_ALL @@ -266,12 +378,34 @@ sysenter_past_esp: movl TI_flags(%ebp), %ecx testw $_TIF_ALLWORK_MASK, %cx jne syscall_exit_work + +#ifdef CONFIG_X86_SWITCH_PAGETABLES + + GET_THREAD_INFO(%ebp) + movl TI_virtual_stack(%ebp), %edx + movl TI_user_pgd(%ebp), %ecx + movl %esp, %ebx + andl $(THREAD_SIZE-1), %ebx + orl %ebx, %edx +sysexit_ret_start_marker: + movl %edx, %esp + movl %ecx, %cr3 + /* + * only ebx is not restored by the userspace sysenter vsyscall + * code, it assumes it to be callee-saved. + */ + movl EBX(%esp), %ebx +#endif + /* if something modifies registers it must also disable sysexit */ movl EIP(%esp), %edx movl OLDESP(%esp), %ecx sti sysexit - +#ifdef CONFIG_X86_SWITCH_PAGETABLES +sysexit_ret_end_marker: + nop +#endif # system call handler stub ENTRY(system_call) @@ -321,6 +455,22 @@ work_notifysig: # deal with pending s # vm86-space xorl %edx, %edx call do_notify_resume + +#if CONFIG_X86_HIGH_ENTRY + /* + * Reload db7 if necessary: + */ + movl TI_flags(%ebp), %ecx + testb $_TIF_DB7, %cl + jnz work_db7 + + jmp restore_all + +work_db7: + movl TI_task(%ebp), %edx; + movl task_thread_db7(%edx), %edx; + movl %edx, %db7; +#endif jmp restore_all ALIGN @@ -358,14 +508,6 @@ syscall_exit_work: jmp resume_userspace ALIGN -syscall_fault: - pushl %eax # save orig_eax - SAVE_ALL - GET_THREAD_INFO(%ebp) - movl $-EFAULT,EAX(%esp) - jmp resume_userspace - - ALIGN syscall_badsys: movl $-ENOSYS,EAX(%esp) jmp resume_userspace @@ -376,7 +518,7 @@ syscall_badsys: */ .data ENTRY(interrupt) -.text +.previous vector=0 ENTRY(irq_entries_start) @@ -386,7 +528,7 @@ ENTRY(irq_entries_start) jmp common_interrupt .data .long 1b -.text +.previous vector=vector+1 .endr @@ -427,12 +569,17 @@ error_code: movl ES(%esp), %edi # get the function address movl %eax, ORIG_EAX(%esp) movl %ecx, ES(%esp) - movl %esp, %edx pushl %esi # push the error code - pushl %edx # push the pt_regs pointer movl $(__USER_DS), %edx movl %edx, %ds movl %edx, %es + +/* clobbers edx, ebx and ebp */ + __SWITCH_KERNELSPACE + + leal 4(%esp), %edx # prepare pt_regs + pushl %edx # push pt_regs + call *%edi addl $8, %esp jmp ret_from_exception @@ -523,7 +670,7 @@ nmi_stack_correct: pushl %edx call do_nmi addl $8, %esp - RESTORE_ALL + jmp restore_all nmi_stack_fixup: FIX_STACK(12,nmi_stack_correct, 1) @@ -600,6 +747,8 @@ ENTRY(spurious_interrupt_bug) pushl $do_spurious_interrupt_bug jmp error_code +.previous + .data ENTRY(sys_call_table) .long sys_restart_syscall /* 0 - old "setup()" system call, used for restarting */ @@ -887,4 +1036,22 @@ ENTRY(sys_call_table) .long sys_mq_getsetattr .long sys_ni_syscall /* reserved for kexec */ + .rept 500-(.-sys_call_table)/4 + .long sys_ni_syscall + .endr + .long sys_fairsched_mknod /* 500 */ + .long sys_fairsched_rmnod + .long sys_fairsched_chwt + .long sys_fairsched_mvpr + .long sys_fairsched_rate + + .rept 510-(.-sys_call_table)/4 + .long sys_ni_syscall + .endr + + .long 
sys_getluid /* 510 */ + .long sys_setluid + .long sys_setublimit + .long sys_ubstat + syscall_table_size=(.-sys_call_table) diff -uprN linux-2.6.8.1.orig/arch/i386/kernel/entry_trampoline.c linux-2.6.8.1-ve022stab032/arch/i386/kernel/entry_trampoline.c --- linux-2.6.8.1.orig/arch/i386/kernel/entry_trampoline.c 1970-01-01 03:00:00.000000000 +0300 +++ linux-2.6.8.1-ve022stab032/arch/i386/kernel/entry_trampoline.c 2005-08-11 19:41:30.000000000 +0400 @@ -0,0 +1,75 @@ +/* + * linux/arch/i386/kernel/entry_trampoline.c + * + * (C) Copyright 2003 Ingo Molnar + * + * This file contains the needed support code for 4GB userspace + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +extern char __entry_tramp_start, __entry_tramp_end, __start___entry_text; + +void __init init_entry_mappings(void) +{ +#ifdef CONFIG_X86_HIGH_ENTRY + + void *tramp; + int p; + + /* + * We need a high IDT and GDT for the 4G/4G split: + */ + trap_init_virtual_IDT(); + + __set_fixmap(FIX_ENTRY_TRAMPOLINE_0, __pa((unsigned long)&__entry_tramp_start), PAGE_KERNEL_EXEC); + __set_fixmap(FIX_ENTRY_TRAMPOLINE_1, __pa((unsigned long)&__entry_tramp_start) + PAGE_SIZE, PAGE_KERNEL_EXEC); + tramp = (void *)fix_to_virt(FIX_ENTRY_TRAMPOLINE_0); + + printk("mapped 4G/4G trampoline to %p.\n", tramp); + BUG_ON((void *)&__start___entry_text != tramp); + /* + * Virtual kernel stack: + */ + BUG_ON(__kmap_atomic_vaddr(KM_VSTACK_TOP) & (THREAD_SIZE-1)); + BUG_ON(sizeof(struct desc_struct)*NR_CPUS*GDT_ENTRIES > 2*PAGE_SIZE); + BUG_ON((unsigned int)&__entry_tramp_end - (unsigned int)&__entry_tramp_start > 2*PAGE_SIZE); + + /* + * set up the initial thread's virtual stack related + * fields: + */ + for (p = 0; p < ARRAY_SIZE(current->thread_info->stack_page); p++) + current->thread_info->stack_page[p] = virt_to_page((char *)current->thread_info + (p*PAGE_SIZE)); + + current->thread_info->virtual_stack = (void *)__kmap_atomic_vaddr(KM_VSTACK_TOP); + + for (p = 0; p < ARRAY_SIZE(current->thread_info->stack_page); p++) { + __kunmap_atomic_type(KM_VSTACK_TOP-p); + __kmap_atomic(current->thread_info->stack_page[p], KM_VSTACK_TOP-p); + } +#endif + current->thread_info->real_stack = (void *)current->thread_info; + current->thread_info->user_pgd = NULL; + current->thread.esp0 = (unsigned long)current->thread_info->real_stack + THREAD_SIZE; +} + + + +void __init entry_trampoline_setup(void) +{ + /* + * old IRQ entries set up by the boot code will still hang + * around - they are a sign of hw trouble anyway, now they'll + * produce a double fault message. + */ + trap_init_virtual_GDT(); +} diff -uprN linux-2.6.8.1.orig/arch/i386/kernel/i386_ksyms.c linux-2.6.8.1-ve022stab032/arch/i386/kernel/i386_ksyms.c --- linux-2.6.8.1.orig/arch/i386/kernel/i386_ksyms.c 2004-08-14 14:56:23.000000000 +0400 +++ linux-2.6.8.1-ve022stab032/arch/i386/kernel/i386_ksyms.c 2005-08-11 19:41:30.000000000 +0400 @@ -92,7 +92,6 @@ EXPORT_SYMBOL_NOVERS(__down_failed_inter EXPORT_SYMBOL_NOVERS(__down_failed_trylock); EXPORT_SYMBOL_NOVERS(__up_wakeup); /* Networking helper routines. 
*/ -EXPORT_SYMBOL(csum_partial_copy_generic); /* Delay loops */ EXPORT_SYMBOL(__ndelay); EXPORT_SYMBOL(__udelay); @@ -106,13 +105,17 @@ EXPORT_SYMBOL_NOVERS(__get_user_4); EXPORT_SYMBOL(strpbrk); EXPORT_SYMBOL(strstr); +#if !defined(CONFIG_X86_UACCESS_INDIRECT) EXPORT_SYMBOL(strncpy_from_user); -EXPORT_SYMBOL(__strncpy_from_user); +EXPORT_SYMBOL(__direct_strncpy_from_user); EXPORT_SYMBOL(clear_user); EXPORT_SYMBOL(__clear_user); EXPORT_SYMBOL(__copy_from_user_ll); EXPORT_SYMBOL(__copy_to_user_ll); EXPORT_SYMBOL(strnlen_user); +#else /* CONFIG_X86_UACCESS_INDIRECT */ +EXPORT_SYMBOL(direct_csum_partial_copy_generic); +#endif EXPORT_SYMBOL(dma_alloc_coherent); EXPORT_SYMBOL(dma_free_coherent); diff -uprN linux-2.6.8.1.orig/arch/i386/kernel/i387.c linux-2.6.8.1-ve022stab032/arch/i386/kernel/i387.c --- linux-2.6.8.1.orig/arch/i386/kernel/i387.c 2004-08-14 14:56:24.000000000 +0400 +++ linux-2.6.8.1-ve022stab032/arch/i386/kernel/i387.c 2005-08-11 19:41:30.000000000 +0400 @@ -227,6 +227,7 @@ void set_fpu_twd( struct task_struct *ts static int convert_fxsr_to_user( struct _fpstate __user *buf, struct i387_fxsave_struct *fxsave ) { + struct _fpreg tmp[8]; /* 80 bytes scratch area */ unsigned long env[7]; struct _fpreg __user *to; struct _fpxreg *from; @@ -243,23 +244,25 @@ static int convert_fxsr_to_user( struct if ( __copy_to_user( buf, env, 7 * sizeof(unsigned long) ) ) return 1; - to = &buf->_st[0]; + to = tmp; from = (struct _fpxreg *) &fxsave->st_space[0]; for ( i = 0 ; i < 8 ; i++, to++, from++ ) { unsigned long __user *t = (unsigned long __user *)to; unsigned long *f = (unsigned long *)from; - if (__put_user(*f, t) || - __put_user(*(f + 1), t + 1) || - __put_user(from->exponent, &to->exponent)) - return 1; + *t = *f; + *(t + 1) = *(f+1); + to->exponent = from->exponent; } + if (copy_to_user(buf->_st, tmp, sizeof(struct _fpreg [8]))) + return 1; return 0; } static int convert_fxsr_from_user( struct i387_fxsave_struct *fxsave, struct _fpstate __user *buf ) { + struct _fpreg tmp[8]; /* 80 bytes scratch area */ unsigned long env[7]; struct _fpxreg *to; struct _fpreg __user *from; @@ -267,6 +270,8 @@ static int convert_fxsr_from_user( struc if ( __copy_from_user( env, buf, 7 * sizeof(long) ) ) return 1; + if (copy_from_user(tmp, buf->_st, sizeof(struct _fpreg [8]))) + return 1; fxsave->cwd = (unsigned short)(env[0] & 0xffff); fxsave->swd = (unsigned short)(env[1] & 0xffff); @@ -278,15 +283,14 @@ static int convert_fxsr_from_user( struc fxsave->fos = env[6]; to = (struct _fpxreg *) &fxsave->st_space[0]; - from = &buf->_st[0]; + from = tmp; for ( i = 0 ; i < 8 ; i++, to++, from++ ) { unsigned long *t = (unsigned long *)to; unsigned long __user *f = (unsigned long __user *)from; - if (__get_user(*t, f) || - __get_user(*(t + 1), f + 1) || - __get_user(to->exponent, &from->exponent)) - return 1; + *t = *f; + *(t + 1) = *(f + 1); + to->exponent = from->exponent; } return 0; } diff -uprN linux-2.6.8.1.orig/arch/i386/kernel/init_task.c linux-2.6.8.1-ve022stab032/arch/i386/kernel/init_task.c --- linux-2.6.8.1.orig/arch/i386/kernel/init_task.c 2004-08-14 14:56:23.000000000 +0400 +++ linux-2.6.8.1-ve022stab032/arch/i386/kernel/init_task.c 2005-08-11 19:41:30.000000000 +0400 @@ -27,7 +27,7 @@ EXPORT_SYMBOL(init_mm); */ union thread_union init_thread_union __attribute__((__section__(".data.init_task"))) = - { INIT_THREAD_INFO(init_task) }; + { INIT_THREAD_INFO(init_task, init_thread_union) }; /* * Initial task structure. @@ -45,5 +45,5 @@ EXPORT_SYMBOL(init_task); * section. 
Since TSS's are completely CPU-local, we want them * on exact cacheline boundaries, to eliminate cacheline ping-pong. */ -struct tss_struct init_tss[NR_CPUS] __cacheline_aligned = { [0 ... NR_CPUS-1] = INIT_TSS }; +struct tss_struct init_tss[NR_CPUS] __attribute__((__section__(".data.tss"))) = { [0 ... NR_CPUS-1] = INIT_TSS }; diff -uprN linux-2.6.8.1.orig/arch/i386/kernel/irq.c linux-2.6.8.1-ve022stab032/arch/i386/kernel/irq.c --- linux-2.6.8.1.orig/arch/i386/kernel/irq.c 2004-08-14 14:54:48.000000000 +0400 +++ linux-2.6.8.1-ve022stab032/arch/i386/kernel/irq.c 2005-08-11 19:41:30.000000000 +0400 @@ -45,6 +45,9 @@ #include #include +#include +#include + /* * Linux has a controller-independent x86 interrupt architecture. * every controller has a 'controller-template', that is used @@ -221,15 +224,19 @@ asmlinkage int handle_IRQ_event(unsigned { int status = 1; /* Force the "do bottom halves" bit */ int retval = 0; + struct user_beancounter *ub; if (!(action->flags & SA_INTERRUPT)) local_irq_enable(); + ub = set_exec_ub(get_ub0()); do { status |= action->flags; retval |= action->handler(irq, action->dev_id, regs); action = action->next; } while (action); + (void)set_exec_ub(ub); + if (status & SA_SAMPLE_RANDOM) add_interrupt_randomness(irq); local_irq_disable(); @@ -429,7 +436,9 @@ asmlinkage unsigned int do_IRQ(struct pt irq_desc_t *desc = irq_desc + irq; struct irqaction * action; unsigned int status; + struct ve_struct *envid; + envid = set_exec_env(get_ve0()); irq_enter(); #ifdef CONFIG_DEBUG_STACKOVERFLOW @@ -513,6 +522,8 @@ asmlinkage unsigned int do_IRQ(struct pt /* build the stack frame on the IRQ stack */ isp = (u32*) ((char*)irqctx + sizeof(*irqctx)); irqctx->tinfo.task = curctx->tinfo.task; + irqctx->tinfo.real_stack = curctx->tinfo.real_stack; + irqctx->tinfo.virtual_stack = curctx->tinfo.virtual_stack; irqctx->tinfo.previous_esp = current_stack_pointer(); *--isp = (u32) action; @@ -568,6 +579,7 @@ out: spin_unlock(&desc->lock); irq_exit(); + (void)set_exec_env(envid); return 1; } @@ -1173,6 +1185,8 @@ asmlinkage void do_softirq(void) curctx = current_thread_info(); irqctx = softirq_ctx[smp_processor_id()]; irqctx->tinfo.task = curctx->task; + irqctx->tinfo.real_stack = curctx->real_stack; + irqctx->tinfo.virtual_stack = curctx->virtual_stack; irqctx->tinfo.previous_esp = current_stack_pointer(); /* build the stack frame on the softirq stack */ diff -uprN linux-2.6.8.1.orig/arch/i386/kernel/ldt.c linux-2.6.8.1-ve022stab032/arch/i386/kernel/ldt.c --- linux-2.6.8.1.orig/arch/i386/kernel/ldt.c 2004-08-14 14:55:47.000000000 +0400 +++ linux-2.6.8.1-ve022stab032/arch/i386/kernel/ldt.c 2005-08-11 19:41:30.000000000 +0400 @@ -2,7 +2,7 @@ * linux/kernel/ldt.c * * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds - * Copyright (C) 1999 Ingo Molnar + * Copyright (C) 1999, 2003 Ingo Molnar */ #include @@ -18,6 +18,8 @@ #include #include #include +#include +#include #ifdef CONFIG_SMP /* avoids "defined but not used" warnig */ static void flush_ldt(void *null) @@ -29,34 +31,31 @@ static void flush_ldt(void *null) static int alloc_ldt(mm_context_t *pc, int mincount, int reload) { - void *oldldt; - void *newldt; - int oldsize; + int oldsize, newsize, i; if (mincount <= pc->size) return 0; + /* + * LDT got larger - reallocate if necessary. 
+ */ oldsize = pc->size; mincount = (mincount+511)&(~511); - if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE) - newldt = vmalloc(mincount*LDT_ENTRY_SIZE); - else - newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL); - - if (!newldt) - return -ENOMEM; - - if (oldsize) - memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE); - oldldt = pc->ldt; - memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE); - pc->ldt = newldt; - wmb(); + newsize = mincount*LDT_ENTRY_SIZE; + for (i = 0; i < newsize; i += PAGE_SIZE) { + int nr = i/PAGE_SIZE; + BUG_ON(i >= 64*1024); + if (!pc->ldt_pages[nr]) { + pc->ldt_pages[nr] = alloc_page(GFP_HIGHUSER|__GFP_UBC); + if (!pc->ldt_pages[nr]) + return -ENOMEM; + clear_highpage(pc->ldt_pages[nr]); + } + } pc->size = mincount; - wmb(); - if (reload) { #ifdef CONFIG_SMP cpumask_t mask; + preempt_disable(); load_LDT(pc); mask = cpumask_of_cpu(smp_processor_id()); @@ -67,24 +66,32 @@ static int alloc_ldt(mm_context_t *pc, i load_LDT(pc); #endif } - if (oldsize) { - if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE) - vfree(oldldt); - else - kfree(oldldt); - } return 0; } static inline int copy_ldt(mm_context_t *new, mm_context_t *old) { - int err = alloc_ldt(new, old->size, 0); - if (err < 0) + int i, err, size = old->size, nr_pages = (size*LDT_ENTRY_SIZE + PAGE_SIZE-1)/PAGE_SIZE; + + err = alloc_ldt(new, size, 0); + if (err < 0) { + new->size = 0; return err; - memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE); + } + for (i = 0; i < nr_pages; i++) + copy_user_highpage(new->ldt_pages[i], old->ldt_pages[i], 0); return 0; } +static void free_ldt(mm_context_t *mc) +{ + int i; + + for (i = 0; i < MAX_LDT_PAGES; i++) + if (mc->ldt_pages[i]) + __free_page(mc->ldt_pages[i]); +} + /* * we do not have to muck with descriptors here, that is * done in switch_mm() as needed. @@ -96,10 +103,13 @@ int init_new_context(struct task_struct init_MUTEX(&mm->context.sem); mm->context.size = 0; + memset(mm->context.ldt_pages, 0, sizeof(struct page *) * MAX_LDT_PAGES); old_mm = current->mm; if (old_mm && old_mm->context.size > 0) { down(&old_mm->context.sem); retval = copy_ldt(&mm->context, &old_mm->context); + if (retval < 0) + free_ldt(&mm->context); up(&old_mm->context.sem); } return retval; @@ -107,23 +117,21 @@ int init_new_context(struct task_struct /* * No need to lock the MM as we are the last user + * Do not touch the ldt register, we are already + * in the next thread. */ void destroy_context(struct mm_struct *mm) { - if (mm->context.size) { - if (mm == current->active_mm) - clear_LDT(); - if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE) - vfree(mm->context.ldt); - else - kfree(mm->context.ldt); - mm->context.size = 0; - } + int i, nr_pages = (mm->context.size*LDT_ENTRY_SIZE + PAGE_SIZE-1) / PAGE_SIZE; + + for (i = 0; i < nr_pages; i++) + __free_page(mm->context.ldt_pages[i]); + mm->context.size = 0; } static int read_ldt(void __user * ptr, unsigned long bytecount) { - int err; + int err, i; unsigned long size; struct mm_struct * mm = current->mm; @@ -138,8 +146,25 @@ static int read_ldt(void __user * ptr, u size = bytecount; err = 0; - if (copy_to_user(ptr, mm->context.ldt, size)) - err = -EFAULT; + /* + * This is necessary just in case we got here straight from a + * context-switch where the ptes were set but no tlb flush + * was done yet. We rather avoid doing a TLB flush in the + * context-switch path and do it here instead. 
+ */ + __flush_tlb_global(); + + for (i = 0; i < size; i += PAGE_SIZE) { + int nr = i / PAGE_SIZE, bytes; + char *kaddr = kmap(mm->context.ldt_pages[nr]); + + bytes = size - i; + if (bytes > PAGE_SIZE) + bytes = PAGE_SIZE; + if (copy_to_user(ptr + i, kaddr, bytes)) + err = -EFAULT; + kunmap(mm->context.ldt_pages[nr]); + } up(&mm->context.sem); if (err < 0) return err; @@ -158,7 +183,7 @@ static int read_default_ldt(void __user err = 0; address = &default_ldt[0]; - size = 5*sizeof(struct desc_struct); + size = 5*LDT_ENTRY_SIZE; if (size > bytecount) size = bytecount; @@ -200,7 +225,15 @@ static int write_ldt(void __user * ptr, goto out_unlock; } - lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt); + /* + * No rescheduling allowed from this point to the install. + * + * We do a TLB flush for the same reason as in the read_ldt() path. + */ + preempt_disable(); + __flush_tlb_global(); + lp = (__u32 *) ((ldt_info.entry_number << 3) + + (char *) __kmap_atomic_vaddr(KM_LDT_PAGE0)); /* Allow LDTs to be cleared by the user. */ if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { @@ -221,6 +254,7 @@ install: *lp = entry_1; *(lp+1) = entry_2; error = 0; + preempt_enable(); out_unlock: up(&mm->context.sem); @@ -248,3 +282,26 @@ asmlinkage int sys_modify_ldt(int func, } return ret; } + +/* + * load one particular LDT into the current CPU + */ +void load_LDT_nolock(mm_context_t *pc, int cpu) +{ + struct page **pages = pc->ldt_pages; + int count = pc->size; + int nr_pages, i; + + if (likely(!count)) { + pages = &default_ldt_page; + count = 5; + } + nr_pages = (count*LDT_ENTRY_SIZE + PAGE_SIZE-1) / PAGE_SIZE; + + for (i = 0; i < nr_pages; i++) { + __kunmap_atomic_type(KM_LDT_PAGE0 - i); + __kmap_atomic(pages[i], KM_LDT_PAGE0 - i); + } + set_ldt_desc(cpu, (void *)__kmap_atomic_vaddr(KM_LDT_PAGE0), count); + load_LDT_desc(); +} diff -uprN linux-2.6.8.1.orig/arch/i386/kernel/Makefile linux-2.6.8.1-ve022stab032/arch/i386/kernel/Makefile --- linux-2.6.8.1.orig/arch/i386/kernel/Makefile 2004-08-14 14:54:51.000000000 +0400 +++ linux-2.6.8.1-ve022stab032/arch/i386/kernel/Makefile 2005-08-11 19:41:30.000000000 +0400 @@ -7,7 +7,7 @@ extra-y := head.o init_task.o vmlinux.ld obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o vm86.o \ ptrace.o i8259.o ioport.o ldt.o setup.o time.o sys_i386.o \ pci-dma.o i386_ksyms.o i387.o dmi_scan.o bootflag.o \ - doublefault.o + doublefault.o entry_trampoline.o obj-y += cpu/ obj-y += timers/ diff -uprN linux-2.6.8.1.orig/arch/i386/kernel/mpparse.c linux-2.6.8.1-ve022stab032/arch/i386/kernel/mpparse.c --- linux-2.6.8.1.orig/arch/i386/kernel/mpparse.c 2004-08-14 14:55:10.000000000 +0400 +++ linux-2.6.8.1-ve022stab032/arch/i386/kernel/mpparse.c 2005-08-11 19:41:30.000000000 +0400 @@ -690,7 +690,7 @@ void __init get_smp_config (void) * Read the physical hardware table. Anything here will * override the defaults. */ - if (!smp_read_mpc((void *)mpf->mpf_physptr)) { + if (!smp_read_mpc((void *)phys_to_virt(mpf->mpf_physptr))) { smp_found_config = 0; printk(KERN_ERR "BIOS bug, MP table errors detected!...\n"); printk(KERN_ERR "... disabling SMP support. 
(tell your hw vendor)\n"); diff -uprN linux-2.6.8.1.orig/arch/i386/kernel/nmi.c linux-2.6.8.1-ve022stab032/arch/i386/kernel/nmi.c --- linux-2.6.8.1.orig/arch/i386/kernel/nmi.c 2004-08-14 14:55:33.000000000 +0400 +++ linux-2.6.8.1-ve022stab032/arch/i386/kernel/nmi.c 2005-08-11 19:41:30.000000000 +0400 @@ -31,7 +31,12 @@ #include #include -unsigned int nmi_watchdog = NMI_NONE; +#ifdef CONFIG_NMI_WATCHDOG +#define NMI_DEFAULT NMI_IO_APIC +#else +#define NMI_DEFAULT NMI_NONE +#endif +unsigned int nmi_watchdog = NMI_DEFAULT; static unsigned int nmi_hz = HZ; static unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */ static unsigned int nmi_p4_cccr_val; @@ -459,6 +464,21 @@ void touch_nmi_watchdog (void) alert_counter[i] = 0; } +static spinlock_t show_regs_lock = SPIN_LOCK_UNLOCKED; + +void smp_show_regs(struct pt_regs *regs, void *info) +{ + if (regs == NULL) + return; + + bust_spinlocks(1); + spin_lock(&show_regs_lock); + printk("----------- IPI show regs -----------"); + show_regs(regs); + spin_unlock(&show_regs_lock); + bust_spinlocks(0); +} + void nmi_watchdog_tick (struct pt_regs * regs) { @@ -486,7 +506,8 @@ void nmi_watchdog_tick (struct pt_regs * bust_spinlocks(1); printk("NMI Watchdog detected LOCKUP on CPU%d, eip %08lx, registers:\n", cpu, regs->eip); show_registers(regs); - printk("console shuts up ...\n"); + smp_nmi_call_function(smp_show_regs, NULL, 1); + bust_spinlocks(1); console_silent(); spin_unlock(&nmi_print_lock); bust_spinlocks(0); diff -uprN linux-2.6.8.1.orig/arch/i386/kernel/process.c linux-2.6.8.1-ve022stab032/arch/i386/kernel/process.c --- linux-2.6.8.1.orig/arch/i386/kernel/process.c 2004-08-14 14:54:46.000000000 +0400 +++ linux-2.6.8.1-ve022stab032/arch/i386/kernel/process.c 2005-08-11 19:41:30.000000000 +0400 @@ -36,6 +36,7 @@ #include #include #include +#include #include #include @@ -46,6 +47,7 @@ #include #include #include +#include #ifdef CONFIG_MATH_EMULATION #include #endif @@ -219,10 +221,12 @@ __setup("idle=", idle_setup); void show_regs(struct pt_regs * regs) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; + extern int die_counter; printk("\n"); - printk("Pid: %d, comm: %20s\n", current->pid, current->comm); - printk("EIP: %04x:[<%08lx>] CPU: %d\n",0xffff & regs->xcs,regs->eip, smp_processor_id()); + printk("Pid: %d, comm: %20s, oopses: %d\n", current->pid, current->comm, die_counter); + printk("EIP: %04x:[<%08lx>] CPU: %d, VCPU: %d:%d\n",0xffff & regs->xcs,regs->eip, smp_processor_id(), + task_vsched_id(current), task_cpu(current)); print_symbol("EIP is at %s\n", regs->eip); if (regs->xcs & 3) @@ -272,6 +276,13 @@ int kernel_thread(int (*fn)(void *), voi { struct pt_regs regs; + /* Don't allow kernel_thread() inside VE */ + if (!ve_is_super(get_exec_env())) { + printk("kernel_thread call inside VE\n"); + dump_stack(); + return -EPERM; + } + memset(®s, 0, sizeof(regs)); regs.ebx = (unsigned long) fn; @@ -311,6 +322,9 @@ void flush_thread(void) struct task_struct *tsk = current; memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8); +#ifdef CONFIG_X86_HIGH_ENTRY + clear_thread_flag(TIF_DB7); +#endif memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); /* * Forget coprocessor state.. @@ -324,9 +338,8 @@ void release_thread(struct task_struct * if (dead_task->mm) { // temporary debugging check if (dead_task->mm->context.size) { - printk("WARNING: dead process %8s still has LDT? <%p/%d>\n", + printk("WARNING: dead process %8s still has LDT? 
<%d>\n", dead_task->comm, - dead_task->mm->context.ldt, dead_task->mm->context.size); BUG(); } @@ -350,7 +363,7 @@ int copy_thread(int nr, unsigned long cl { struct pt_regs * childregs; struct task_struct *tsk; - int err; + int err, i; childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p->thread_info)) - 1; *childregs = *regs; @@ -361,7 +374,18 @@ int copy_thread(int nr, unsigned long cl p->thread.esp = (unsigned long) childregs; p->thread.esp0 = (unsigned long) (childregs+1); + /* + * get the two stack pages, for the virtual stack. + * + * IMPORTANT: this code relies on the fact that the task + * structure is an THREAD_SIZE aligned piece of physical memory. + */ + for (i = 0; i < ARRAY_SIZE(p->thread_info->stack_page); i++) + p->thread_info->stack_page[i] = + virt_to_page((unsigned long)p->thread_info + (i*PAGE_SIZE)); + p->thread.eip = (unsigned long) ret_from_fork; + p->thread_info->real_stack = p->thread_info; savesegment(fs,p->thread.fs); savesegment(gs,p->thread.gs); @@ -513,10 +537,42 @@ struct task_struct fastcall * __switch_t __unlazy_fpu(prev_p); +#ifdef CONFIG_X86_HIGH_ENTRY +{ + int i; + /* + * Set the ptes of the virtual stack. (NOTE: a one-page TLB flush is + * needed because otherwise NMIs could interrupt the + * user-return code with a virtual stack and stale TLBs.) + */ + for (i = 0; i < ARRAY_SIZE(next_p->thread_info->stack_page); i++) { + __kunmap_atomic_type(KM_VSTACK_TOP-i); + __kmap_atomic(next_p->thread_info->stack_page[i], KM_VSTACK_TOP-i); + } + /* + * NOTE: here we rely on the task being the stack as well + */ + next_p->thread_info->virtual_stack = + (void *)__kmap_atomic_vaddr(KM_VSTACK_TOP); +} +#if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP) + /* + * If next was preempted on entry from userspace to kernel, + * and now it's on a different cpu, we need to adjust %esp. + * This assumes that entry.S does not copy %esp while on the + * virtual stack (with interrupts enabled): which is so, + * except within __SWITCH_KERNELSPACE itself. + */ + if (unlikely(next->esp >= TASK_SIZE)) { + next->esp &= THREAD_SIZE - 1; + next->esp |= (unsigned long) next_p->thread_info->virtual_stack; + } +#endif +#endif /* * Reload esp0, LDT and the page table pointer: */ - load_esp0(tss, next); + load_virtual_esp0(tss, next_p); /* * Load the per-thread Thread-Local Storage descriptor. 
@@ -578,6 +634,13 @@ struct task_struct fastcall * __switch_t
 
 asmlinkage int sys_fork(struct pt_regs regs)
 {
+	struct faudit_regs_arg arg;
+
+	arg.regs = &regs;
+	if (virtinfo_notifier_call(VITYPE_FAUDIT, VIRTINFO_FAUDIT_FORK, &arg)
+			!= NOTIFY_DONE)
+		return arg.err;
+
 	return do_fork(SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
 }
 
@@ -586,6 +649,12 @@ asmlinkage int sys_clone(struct pt_regs
 	unsigned long clone_flags;
 	unsigned long newsp;
 	int __user *parent_tidptr, *child_tidptr;
+	struct faudit_regs_arg arg;
+
+	arg.regs = &regs;
+	if (virtinfo_notifier_call(VITYPE_FAUDIT, VIRTINFO_FAUDIT_CLONE, &arg)
+			!= NOTIFY_DONE)
+		return arg.err;
 
 	clone_flags = regs.ebx;
 	newsp = regs.ecx;
@@ -608,6 +677,13 @@ asmlinkage int sys_clone(struct pt_regs
  */
 asmlinkage int sys_vfork(struct pt_regs regs)
 {
+	struct faudit_regs_arg arg;
+
+	arg.regs = &regs;
+	if (virtinfo_notifier_call(VITYPE_FAUDIT, VIRTINFO_FAUDIT_VFORK, &arg)
+			!= NOTIFY_DONE)
+		return arg.err;
+
 	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
 }
 
@@ -618,6 +694,13 @@ asmlinkage int sys_execve(struct pt_regs
 {
 	int error;
 	char * filename;
+	struct faudit_regs_arg arg;
+
+	arg.regs = &regs;
+	if (virtinfo_notifier_call(VITYPE_FAUDIT, VIRTINFO_FAUDIT_EXECVE, &arg)
+			!= NOTIFY_DONE)
+		return arg.err;
+
 
 	filename = getname((char __user *) regs.ebx);
 	error = PTR_ERR(filename);
@@ -759,6 +842,8 @@ asmlinkage int sys_get_thread_area(struc
 	if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
 		return -EINVAL;
 
+	memset(&info, 0, sizeof(info));
+
 	desc = current->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
 
 	info.entry_number = idx;
diff -uprN linux-2.6.8.1.orig/arch/i386/kernel/ptrace.c linux-2.6.8.1-ve022stab032/arch/i386/kernel/ptrace.c
--- linux-2.6.8.1.orig/arch/i386/kernel/ptrace.c	2004-08-14 14:55:09.000000000 +0400
+++ linux-2.6.8.1-ve022stab032/arch/i386/kernel/ptrace.c	2005-08-11 19:41:31.000000000 +0400
@@ -253,7 +253,7 @@ asmlinkage int sys_ptrace(long request,
 	}
 	ret = -ESRCH;
 	read_lock(&tasklist_lock);
-	child = find_task_by_pid(pid);
+	child = find_task_by_pid_ve(pid);
 	if (child)
 		get_task_struct(child);
 	read_unlock(&tasklist_lock);
@@ -388,7 +388,7 @@ asmlinkage int sys_ptrace(long request,
 		long tmp;
 
 		ret = 0;
-		if (child->state == TASK_ZOMBIE)	/* already dead */
+		if (child->exit_state == EXIT_ZOMBIE)	/* already dead */
 			break;
 		child->exit_code = SIGKILL;
 		/* make sure the single step bit is not set. */
@@ -541,8 +541,10 @@ void do_syscall_trace(struct pt_regs *re
 		return;
 	/* the 0x80 provides a way for the tracing parent to distinguish
 	   between a syscall stop and SIGTRAP delivery */
+	set_pn_state(current, entryexit ? PN_STOP_LEAVE : PN_STOP_ENTRY);
 	ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD)
 				 ? 0x80 : 0));
+	clear_pn_state(current);
 
 	/*
 	 * this isn't the same as continuing with a signal, but it will do
diff -uprN linux-2.6.8.1.orig/arch/i386/kernel/reboot.c linux-2.6.8.1-ve022stab032/arch/i386/kernel/reboot.c
--- linux-2.6.8.1.orig/arch/i386/kernel/reboot.c	2004-08-14 14:55:09.000000000 +0400
+++ linux-2.6.8.1-ve022stab032/arch/i386/kernel/reboot.c	2005-08-11 19:41:30.000000000 +0400
@@ -233,12 +233,11 @@ void machine_real_restart(unsigned char
 	CMOS_WRITE(0x00, 0x8f);
 	spin_unlock_irqrestore(&rtc_lock, flags);
 
-	/* Remap the kernel at virtual address zero, as well as offset zero
-	   from the kernel segment. This assumes the kernel segment starts at
-	   virtual address PAGE_OFFSET.
*/ - - memcpy (swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, - sizeof (swapper_pg_dir [0]) * KERNEL_PGD_PTRS); + /* + * Remap the first 16 MB of RAM (which includes the kernel image) + * at virtual address zero: + */ + setup_identity_mappings(swapper_pg_dir, 0, LOW_MAPPINGS_SIZE); /* * Use `swapper_pg_dir' as our page directory. diff -uprN linux-2.6.8.1.orig/arch/i386/kernel/setup.c linux-2.6.8.1-ve022stab032/arch/i386/kernel/setup.c --- linux-2.6.8.1.orig/arch/i386/kernel/setup.c 2004-08-14 14:55:32.000000000 +0400 +++ linux-2.6.8.1-ve022stab032/arch/i386/kernel/setup.c 2005-08-11 19:41:30.000000000 +0400 @@ -39,6 +39,7 @@ #include #include #include +#include #include