Index: oldkernel/linux/Documentation/Configure.help diff -u linux/Documentation/Configure.help:1.3 linux/Documentation/Configure.help:1.4 --- linux/Documentation/Configure.help:1.3 Thu Jun 1 14:57:34 2000 +++ linux/Documentation/Configure.help Thu Jun 1 15:05:19 2000 @@ -1659,10 +1659,10 @@ all x86 CPU types (albeit not optimally fast), you can specify "386" here. - If you specify one of "486" or "586" or "Pentium" or "PPro", then - the kernel will not necessarily run on earlier architectures (e.g. a - Pentium optimized kernel will run on a PPro, but not necessarily on - a i486). + If you specify one of "486" or "586" or "Pentium" or "PPro" or "PIII", + then the kernel will not necessarily run on earlier architectures + (e.g. a Pentium optimized kernel will run on a PPro, but not necessarily + on a i486). Here are the settings recommended for greatest speed: - "386" for the AMD/Cyrix/Intel 386DX/DXL/SL/SLC/SX, Cyrix/TI @@ -1676,8 +1676,30 @@ K6-3D. - "PPro" for the Cyrix/IBM/National Semiconductor 6x86MX, MII and Intel Pentium II/Pentium Pro. + - "PIII/Xeon/Deschutes" for the PIII (Katmai), Xeon and later PIIs + with the Deschutes or Mendocino core. You have to choose this for + MMX2 support. If you don't know what to do, choose "386". + +Disable PII/PIII Serial Number at bootup +CONFIG_X86_PN_OFF + This makes the kernel disable the CPUID serial number that is embedded on + the new PIII CPUs at bootup. + +Enable PII/PIII Extended Fast FPU save and restore support +CONFIG_X86_FX + This enables use of the new PII/PIII FXSAVE/FXRSTOR support. This item + is required to make use of the new PIII 128bit XMM registers. It is safe + to leave this enabled all the time. + +Enable CPU Specific (MMX/MMX2) Optimizations +CONFIG_X86_CPU_OPTIMIZATIONS + This enables use of the MMX registers and 128bit MMX2 registers on CPUs + that can support the new instructions (Pentium/AMD K6 or newer). 
In + order to support the Pentium III 128 bit XMM registers you must enable + both this and PII/PIII Extended Fast FPU save support. It is safe to + leave this enabled all the time. VGA text console CONFIG_VGA_CONSOLE Index: oldkernel/linux/arch/i386/Makefile diff -u linux/arch/i386/Makefile:1.1.1.1 linux/arch/i386/Makefile:1.2 --- linux/arch/i386/Makefile:1.1.1.1 Wed May 31 12:33:53 2000 +++ linux/arch/i386/Makefile Thu Jun 1 15:05:19 2000 @@ -43,6 +43,10 @@ CFLAGS := $(CFLAGS) -m486 -malign-loops=2 -malign-jumps=2 -malign-functions=2 -DCPU=686 endif +ifdef CONFIG_M686FX +CFLAGS := $(CFLAGS) -m486 -malign-loops=0 -malign-jumps=0 -malign-functions=0 -DCPU=686 +endif + HEAD := arch/i386/kernel/head.o arch/i386/kernel/init_task.o SUBDIRS := $(SUBDIRS) arch/i386/kernel arch/i386/mm arch/i386/lib Index: oldkernel/linux/arch/i386/config.in diff -u linux/arch/i386/config.in:1.1.1.1 linux/arch/i386/config.in:1.2 --- linux/arch/i386/config.in:1.1.1.1 Wed May 31 12:33:53 2000 +++ linux/arch/i386/config.in Thu Jun 1 15:05:19 2000 @@ -16,7 +16,8 @@ 486/Cx486 CONFIG_M486 \ 586/K5/5x86/6x86 CONFIG_M586 \ Pentium/K6/TSC CONFIG_M586TSC \ - PPro/6x86MX CONFIG_M686" PPro + PPro/6x86MX/PII CONFIG_M686 \ + PIII/Xeon/Deschutes CONFIG_M686FX" PIII # # Define implied options from the CPU selection here # @@ -26,20 +27,24 @@ define_bool CONFIG_X86_BSWAP y define_bool CONFIG_X86_POPAD_OK y fi -if [ "$CONFIG_M686" = "y" -o "$CONFIG_M586TSC" = "y" ]; then +if [ "$CONFIG_M686FX" = "y" -o "$CONFIG_M686" = "y" \ + -o "$CONFIG_M586TSC" = "y" ]; then define_bool CONFIG_X86_TSC y fi -if [ "$CONFIG_M686" = "y" ]; then +if [ "$CONFIG_M686FX" = "y" -o "$CONFIG_M686" = "y" ]; then define_bool CONFIG_X86_GOOD_APIC y fi +bool 'Disable the PII/PIII Serial Number at bootup' CONFIG_X86_PN_OFF +bool 'Enable PII/PIII Extended/Fast FPU save and restore support' CONFIG_X86_FX +bool 'Enable CPU Specific (MMX/MMX2) Optimization Functions' CONFIG_X86_CPU_OPTIMIZATIONS +bool 'Math emulation' CONFIG_MATH_EMULATION 
+bool 'MTRR (Memory Type Range Register) support' CONFIG_MTRR +bool 'Symmetric multi-processing support' CONFIG_SMP choice 'Maximum Physical Memory' \ "1GB CONFIG_1GB \ 2GB CONFIG_2GB" 1GB -bool 'Math emulation' CONFIG_MATH_EMULATION -bool 'MTRR (Memory Type Range Register) support' CONFIG_MTRR -bool 'Symmetric multi-processing support' CONFIG_SMP endmenu mainmenu_option next_comment Index: oldkernel/linux/arch/i386/defconfig diff -u linux/arch/i386/defconfig:1.2 linux/arch/i386/defconfig:1.3 --- linux/arch/i386/defconfig:1.2 Thu Jun 1 14:51:28 2000 +++ linux/arch/i386/defconfig Thu Jun 1 15:05:19 2000 @@ -21,11 +21,14 @@ CONFIG_X86_POPAD_OK=y CONFIG_X86_TSC=y CONFIG_X86_GOOD_APIC=y -CONFIG_1GB=y -# CONFIG_2GB is not set +CONFIG_X86_PN_OFF=y +CONFIG_X86_FX=y +CONFIG_X86_CPU_OPTIMIZATIONS=y # CONFIG_MATH_EMULATION is not set # CONFIG_MTRR is not set CONFIG_SMP=y +CONFIG_1GB=y +# CONFIG_2GB is not set # # Loadable module support Index: oldkernel/linux/arch/i386/kernel/head.S diff -u linux/arch/i386/kernel/head.S:1.1.1.1 linux/arch/i386/kernel/head.S:1.2 --- linux/arch/i386/kernel/head.S:1.1.1.1 Wed May 31 12:33:53 2000 +++ linux/arch/i386/kernel/head.S Thu Jun 1 15:05:19 2000 @@ -14,7 +14,6 @@ #include #include - #define CL_MAGIC_ADDR 0x90020 #define CL_MAGIC 0xA33F #define CL_BASE_ADDR 0x90000 @@ -32,7 +31,8 @@ #define X86_HARD_MATH CPU_PARAMS+6 #define X86_CPUID CPU_PARAMS+8 #define X86_CAPABILITY CPU_PARAMS+12 -#define X86_VENDOR_ID CPU_PARAMS+16 +#define X86_MMU_CR4 CPU_PARAMS+16 +#define X86_VENDOR_ID CPU_PARAMS+20 /* * swapper_pg_dir is the main page directory, address 0x00101000 @@ -59,9 +59,8 @@ * NOTE! We have to correct for the fact that we're * not yet offset PAGE_OFFSET.. 
*/ -#define cr4_bits mmu_cr4_features-__PAGE_OFFSET movl %cr4,%eax # Turn on 4Mb pages - orl cr4_bits,%eax + orl X86_MMU_CR4-__PAGE_OFFSET,%eax movl %eax,%cr4 #endif /* Index: oldkernel/linux/arch/i386/kernel/i386_ksyms.c diff -u linux/arch/i386/kernel/i386_ksyms.c:1.1.1.1 linux/arch/i386/kernel/i386_ksyms.c:1.2 --- linux/arch/i386/kernel/i386_ksyms.c:1.1.1.1 Wed May 31 12:33:53 2000 +++ linux/arch/i386/kernel/i386_ksyms.c Thu Jun 1 15:05:19 2000 @@ -119,3 +119,13 @@ #ifdef CONFIG_VT EXPORT_SYMBOL(screen_info); #endif + +#ifdef CONFIG_X86_CPU_OPTIMIZATIONS +EXPORT_SYMBOL(best_memcpy); +EXPORT_SYMBOL(best_memset); +EXPORT_SYMBOL(best_copy_to_user); +EXPORT_SYMBOL(best_copy_from_user); +EXPORT_SYMBOL(__best_copy_to_user); +EXPORT_SYMBOL(__best_copy_from_user); +#endif + Index: oldkernel/linux/arch/i386/kernel/process.c diff -u linux/arch/i386/kernel/process.c:1.1.1.1 linux/arch/i386/kernel/process.c:1.2 --- linux/arch/i386/kernel/process.c:1.1.1.1 Wed May 31 12:33:53 2000 +++ linux/arch/i386/kernel/process.c Thu Jun 1 15:05:19 2000 @@ -42,6 +42,7 @@ #include #include #include +#include #ifdef CONFIG_MATH_EMULATION #include #endif @@ -582,6 +583,106 @@ } /* + * FPU state handling functions + */ + +int i387_hard_to_user ( struct user_i387_struct * user, + union i387_hard_union * hard) +{ +#ifdef CONFIG_X86_FX + int i, err = 0; + short *tmp, *tmp2; + union i387_hard_union hard2; +#else + int err = 0; +#endif + + if (!access_ok(VERIFY_WRITE, user, sizeof(*user))) + return -EFAULT; +#ifdef CONFIG_X86_FX + if (boot_cpu_data.x86_capability & X86_FEATURE_FXSR) { + hard2.fsave.cwd = 0xffff0000 | hard->fxsave.fxcwd; + hard2.fsave.swd = 0xffff0000 | hard->fxsave.fxswd; + hard2.fsave.twd = fputag_KNI_to_387(hard->fxsave.fxtwd); + hard2.fsave.fip = hard->fxsave.fxfip; + hard2.fsave.fcs = hard->fxsave.fxfcs; + hard2.fsave.foo = hard->fxsave.fxfoo; + hard2.fsave.fos = hard->fxsave.fxfos; + + tmp = (short *)&hard2.fsave.st_space[0]; + tmp2 = (short *)&hard->fxsave.st_space[0]; + + 
/* + * Transform the two layouts: + * (we do not mix 32-bit access with 16-bit access because + * that's suboptimal on PPros) + */ + + for (i = 0; i < 8; i++) { + *tmp = *tmp2; tmp++; tmp2++; + *tmp = *tmp2; tmp++; tmp2++; + *tmp = *tmp2; tmp++; tmp2++; + *tmp = *tmp2; tmp++; tmp2++; + *tmp = *tmp2; tmp++; tmp2 += 4; + } + err = copy_to_user((void *)(user),(&(hard2)), + sizeof(struct i387_hard_fsave)); + } else +#endif + err = copy_to_user((void *)(user), + (&(hard->fsave.cwd)), + sizeof(struct i387_hard_fsave)); + return err; +} + +int i387_user_to_hard (union i387_hard_union * hard, + struct user_i387_struct * user) +{ +#ifdef CONFIG_X86_FX + int i, err = 0; + short *tmp, *tmp2; + union i387_hard_union hard2; +#else + int err = 0; +#endif + + if (!access_ok(VERIFY_READ, user, sizeof(*user))) + return -EFAULT; +#ifdef CONFIG_X86_FX + if (boot_cpu_data.x86_capability & X86_FEATURE_FXSR) { + err = copy_from_user((&(hard2)),(void *)(user), + sizeof(struct i387_hard_fsave)); + hard->fxsave.fxcwd = hard2.fsave.cwd & 0xffff; + hard->fxsave.fxswd = hard2.fsave.swd & 0xffff; + hard->fxsave.fxtwd = fputag_387_to_KNI(hard2.fsave.twd); + hard->fxsave.fxfip = hard2.fsave.fip; + hard->fxsave.fxfcs = hard2.fsave.fcs & 0xffff; + hard->fxsave.fxfoo = hard2.fsave.foo; + hard->fxsave.fxfos = hard2.fsave.fos & 0xffff; + + tmp2 = (short *)&hard->fxsave.st_space[0]; + tmp = (short *)&hard2.fsave.st_space[0]; + + for (i = 0; i < 8; i++) { + *tmp2 = *tmp; tmp++; tmp2++; + *tmp2 = *tmp; tmp++; tmp2++; + *tmp2 = *tmp; tmp++; tmp2++; + *tmp2 = *tmp; tmp++; tmp2++; + *tmp2 = *tmp; tmp++; tmp2++; + *tmp2 = 0; tmp2++; + *tmp2 = 0; tmp2++; + *tmp2 = 0; tmp2++; + } + } else +#endif + err = copy_from_user((&(hard->fsave.cwd)), + (void *)(user), + sizeof(struct i387_hard_fsave)); + return err; +} + + +/* + * Save a segment. 
*/ #define savesegment(seg,value) \ @@ -626,13 +727,43 @@ */ int dump_fpu (struct pt_regs * regs, struct user_i387_struct* fpu) { +#ifdef CONFIG_X86_FX + int fpvalid, i; + short *tmp, *tmp2; + struct task_struct *tsk = current; + union i387_hard_union *hard; +#else int fpvalid; struct task_struct *tsk = current; - +#endif fpvalid = tsk->used_math; if (fpvalid) { unlazy_fpu(tsk); - memcpy(fpu,&tsk->tss.i387.hard,sizeof(*fpu)); +#ifdef CONFIG_X86_FX + if (boot_cpu_data.x86_capability & X86_FEATURE_FXSR) { + hard = &tsk->tss.i387.hard; + + fpu->cwd = 0xffff0000 | hard->fxsave.fxcwd; + fpu->swd = 0xffff0000 | hard->fxsave.fxswd; + fpu->twd = fputag_KNI_to_387(hard->fxsave.fxtwd); + fpu->fip = hard->fxsave.fxfip; + fpu->fcs = hard->fxsave.fxfcs; + fpu->foo = hard->fxsave.fxfoo; + fpu->fos = hard->fxsave.fxfos; + + tmp = (short *)&fpu->st_space[0]; + tmp2 = (short *)&hard->fxsave.st_space[0]; + + for (i = 0; i < 8; i++) { + *tmp = *tmp2; tmp++; tmp2++; + *tmp = *tmp2; tmp++; tmp2++; + *tmp = *tmp2; tmp++; tmp2++; + *tmp = *tmp2; tmp++; tmp2++; + *tmp = *tmp2; tmp++; tmp2+=4; + } + } else +#endif + memcpy(fpu,&tsk->tss.i387.hard.fsave,sizeof(*fpu)); } return fpvalid; @@ -692,8 +823,8 @@ /* * switch_to(x,yn) should switch tasks from x to y. * - * We fsave/fwait so that an exception goes off at the right time - * (as a call from the fsave or fwait in effect) rather than to + * We fpu_save so that an exception goes off at the right time + * (as a call from the f*save or fwait in effect) rather than to * the wrong process. Lazy FP saving no longer makes any sense * with modern CPU's, and this simplifies a lot of things (SMP * and UP become the same). 
Index: oldkernel/linux/arch/i386/kernel/ptrace.c diff -u linux/arch/i386/kernel/ptrace.c:1.1.1.1 linux/arch/i386/kernel/ptrace.c:1.2 --- linux/arch/i386/kernel/ptrace.c:1.1.1.1 Wed May 31 12:33:53 2000 +++ linux/arch/i386/kernel/ptrace.c Thu Jun 1 15:05:19 2000 @@ -17,6 +17,7 @@ #include #include #include +#include /* * does not yet catch signals sent when the child dies. @@ -646,6 +647,9 @@ }; case PTRACE_GETFPREGS: { /* Get the child FPU state. */ + /* + * user-space expects an 'old-style' FPU dump. + */ if (!access_ok(VERIFY_WRITE, (unsigned *)data, sizeof(struct user_i387_struct))) { @@ -655,15 +659,17 @@ ret = 0; if ( !child->used_math ) { /* Simulate an empty FPU. */ - child->tss.i387.hard.cwd = 0xffff037f; - child->tss.i387.hard.swd = 0xffff0000; - child->tss.i387.hard.twd = 0xffffffff; + i387_set_cwd(child->tss.i387.hard, 0x037f); + i387_set_swd(child->tss.i387.hard, 0x0000); + i387_set_twd(child->tss.i387.hard, 0xffff); } #ifdef CONFIG_MATH_EMULATION if ( boot_cpu_data.hard_math ) { #endif - __copy_to_user((void *)data, &child->tss.i387.hard, - sizeof(struct user_i387_struct)); + i387_hard_to_user( + (struct user_i387_struct *)data, + &child->tss.i387.hard + ); #ifdef CONFIG_MATH_EMULATION } else { save_i387_soft(&child->tss.i387.soft, @@ -684,8 +690,10 @@ #ifdef CONFIG_MATH_EMULATION if ( boot_cpu_data.hard_math ) { #endif - __copy_from_user(&child->tss.i387.hard, (void *)data, - sizeof(struct user_i387_struct)); + i387_user_to_hard( + &child->tss.i387.hard, + (struct user_i387_struct *)data + ); #ifdef CONFIG_MATH_EMULATION } else { restore_i387_soft(&child->tss.i387.soft, Index: oldkernel/linux/arch/i386/kernel/setup.c diff -u linux/arch/i386/kernel/setup.c:1.1.1.1 linux/arch/i386/kernel/setup.c:1.2 --- linux/arch/i386/kernel/setup.c:1.1.1.1 Wed May 31 12:33:53 2000 +++ linux/arch/i386/kernel/setup.c Thu Jun 1 15:05:19 2000 @@ -104,6 +104,17 @@ extern int _etext, _edata, _end; extern unsigned long cpu_hz; +#ifdef CONFIG_X86_PN_OFF +int 
disable_x86_serial_nr = 1; +#else +int disable_x86_serial_nr = 0; +#endif + +/* + * For the various FPU using kernel accelerator routines + */ +spinlock_t kern_fpu_lock = SPIN_LOCK_UNLOCKED; + /* * This is set up by the setup-routine at boot-time */ @@ -809,20 +820,6 @@ if (c->x86_vendor == X86_VENDOR_AMD && amd_model(c)) return; - - if (c->cpuid_level > 0 && c->x86_vendor == X86_VENDOR_INTEL) - { - if(c->x86_capability&(1<<18)) - { - /* Disable processor serial number on Intel Pentium III - from code by Phil Karn */ - unsigned long lo,hi; - rdmsr(0x119,lo,hi); - lo |= 0x200000; - wrmsr(0x119,lo,hi); - printk(KERN_INFO "Pentium-III serial number disabled.\n"); - } - } if (c->cpuid_level > 1) { /* supports eax=2 call */ @@ -909,7 +906,15 @@ } cyrix_model(&boot_cpu_data); } - + +/* + * Setup function for serial number stuff + */ + +__initfunc(void x86_serial_nr_setup(char *str, int *ints)) +{ + disable_x86_serial_nr = !disable_x86_serial_nr; +} static char *cpu_vendor_names[] __initdata = { Index: oldkernel/linux/arch/i386/kernel/signal.c diff -u linux/arch/i386/kernel/signal.c:1.1.1.1 linux/arch/i386/kernel/signal.c:1.2 --- linux/arch/i386/kernel/signal.c:1.1.1.1 Wed May 31 12:33:53 2000 +++ linux/arch/i386/kernel/signal.c Thu Jun 1 15:05:19 2000 @@ -21,6 +21,7 @@ #include #include #include +#include #define DEBUG_SIG 0 @@ -153,9 +154,14 @@ static inline int restore_i387_hard(struct _fpstate *buf) { + int err = 0; struct task_struct *tsk = current; clear_fpu(tsk); - return __copy_from_user(&tsk->tss.i387.hard, buf, sizeof(*buf)); + + err = i387_user_to_hard(&tsk->tss.i387.hard, + (struct user_i387_struct *)buf); + err |= get_user(tsk->tss.i387.hard.fsave.swd, &buf->status); + return err; } static inline int restore_i387(struct _fpstate *buf) @@ -305,11 +311,14 @@ static inline int save_i387_hard(struct _fpstate * buf) { + int err = 0; struct task_struct *tsk = current; unlazy_fpu(tsk); - tsk->tss.i387.hard.status = tsk->tss.i387.hard.swd; - if (__copy_to_user(buf, 
&tsk->tss.i387.hard, sizeof(*buf))) + err = i387_hard_to_user((struct user_i387_struct *)buf, + &tsk->tss.i387.hard); + err |= put_user(tsk->tss.i387.hard.fsave.swd, &buf->status); + if (err) return -1; return 1; } Index: oldkernel/linux/arch/i386/kernel/smp.c diff -u linux/arch/i386/kernel/smp.c:1.1.1.1 linux/arch/i386/kernel/smp.c:1.2 --- linux/arch/i386/kernel/smp.c:1.1.1.1 Wed May 31 12:33:53 2000 +++ linux/arch/i386/kernel/smp.c Thu Jun 1 15:05:19 2000 @@ -891,6 +891,8 @@ */ int __init start_secondary(void *unused) { + disable_serial_nr(); + load_default_mxcsr(); /* * Dont put anything before smp_callin(), SMP * booting is too fragile that we want to limit the Index: oldkernel/linux/arch/i386/kernel/traps.c diff -u linux/arch/i386/kernel/traps.c:1.1.1.1 linux/arch/i386/kernel/traps.c:1.2 --- linux/arch/i386/kernel/traps.c:1.1.1.1 Wed May 31 12:33:53 2000 +++ linux/arch/i386/kernel/traps.c Thu Jun 1 15:05:19 2000 @@ -33,6 +33,7 @@ #include #include #include +#include #include @@ -421,7 +422,9 @@ * (this will also clear the error) */ task = current; - save_fpu(task); + i387_save_hard(task->tss.i387); + task->flags &= ~PF_USEDFPU; + stts(); task->tss.trap_no = 16; task->tss.error_code = 0; force_sig(SIGFPE, task); @@ -452,17 +455,44 @@ asmlinkage void math_state_restore(struct pt_regs regs) { __asm__ __volatile__("clts"); /* Allow maths ops (or we recurse) */ - if(current->used_math) - __asm__("frstor %0": :"m" (current->tss.i387)); - else - { + /* + * If we have either of the kernel FPU use states set in the + * fpustate variable, then this will be a kernel math trap. + * Otherwise, this is userspace trying to use the FPU. 
+ */ + if(current->tss.x86_fpustate & X86_FPUSTATE_KERN_ANY) { + load_default_mxcsr(); /* we don't ever mess with this in + kernel space, so just make sure + we have a reasonable one so we + don't start taking unmasked + exceptions by accident */ + if(current->tss.mmx_reg_space != NULL) + __asm__("movq 0x00(%0), %%mm0\n\t" + "movq 0x08(%0), %%mm1\n\t" + "movq 0x10(%0), %%mm2\n\t" + "movq 0x18(%0), %%mm3\n\t" + :: "r" (current->tss.mmx_reg_space)); + if(current->tss.kni_reg_space != NULL) + __asm__("movups 0x00(%0), %%xmm0\n\t" + "movups 0x10(%0), %%xmm1\n\t" + "movups 0x20(%0), %%xmm2\n\t" + "movups 0x30(%0), %%xmm3\n\t" + :: "r" (current->tss.kni_reg_space)); + } else if(current->tss.x86_fpustate & X86_FPUSTATE_USER_SAVED) { + i387_restore_hard(current->tss.i387); + current->tss.x86_fpustate = 0; + } else if(current->used_math) { + i387_restore_hard(current->tss.i387); + current->flags|=PF_USEDFPU; /* make switch_to() work */ + } else { /* * Our first FPU usage, clean the chip. */ __asm__("fninit"); + load_default_mxcsr(); current->used_math = 1; + current->flags|=PF_USEDFPU; /* make switch_to() work */ } - current->flags|=PF_USEDFPU; /* So we fnsave on switch_to() */ } #ifndef CONFIG_MATH_EMULATION Index: oldkernel/linux/arch/i386/lib/Makefile diff -u linux/arch/i386/lib/Makefile:1.1.1.1 linux/arch/i386/lib/Makefile:1.2 --- linux/arch/i386/lib/Makefile:1.1.1.1 Wed May 31 12:33:53 2000 +++ linux/arch/i386/lib/Makefile Thu Jun 1 15:05:19 2000 @@ -9,4 +9,8 @@ L_OBJS = checksum.o old-checksum.o semaphore.o delay.o \ usercopy.o getuser.o putuser.o +ifeq ($(CONFIG_X86_CPU_OPTIMIZATIONS),y) + L_OBJS += best_function.o simd.o +endif + include $(TOPDIR)/Rules.make Index: oldkernel/linux/arch/i386/lib/best_function.c diff -u /dev/null linux/arch/i386/lib/best_function.c:1.1 --- /dev/null Mon Jul 31 21:12:24 2000 +++ linux/arch/i386/lib/best_function.c Thu Jun 1 15:05:19 2000 @@ -0,0 +1,196 @@ +/* + * SIMD functions. 
These replace the functions in asm-i386/string.h + * whenever it makes sense. These also un-inline those functions. + * + * Copyright 1999, Doug Ledford + * + * These functions are simple and trivial, consider them to be + * public domain + */ + +#include +#include +#include +#include + +/* + * We declare our accelerator functions here since this is the only place + * that needs the declarations which makes a header file a pain to deal + * with + */ +extern void * kni_memcpy(void *, const void *, size_t); +extern void * kni_memset(void *, char, size_t); +extern unsigned long kni_copy_to_user(void *, const void *, unsigned long); +extern unsigned long kni_copy_from_user(void *, const void *, unsigned long); +extern unsigned long __kni_copy_to_user_nocheck(void *, const void *, unsigned long); +extern unsigned long __kni_copy_from_user_nocheck(void *, const void *, unsigned long); + +static void * best_memcpy_final(void *, const void *, size_t); +static void * best_memset_final(void *, char, size_t); +static unsigned long best_copy_to_user_final(void *, const void *, unsigned long); +static unsigned long best_copy_from_user_final(void *, const void *, unsigned long); +static unsigned long __best_copy_to_user_final(void *, const void *, unsigned long); +static unsigned long __best_copy_from_user_final(void *, const void *, unsigned long); + +void * best_memcpy(void * to, const void * from, size_t n) +{ + int BAR = (int)__builtin_return_address(0); + int *caller = (int *)BAR - 1; + if(boot_cpu_data.enable_fixups) { + if ( (boot_cpu_data.mmu_cr4_features & X86_CR4_OSFXSR) && + (boot_cpu_data.x86_capability & X86_FEATURE_XMM) ) { + *caller = (int)kni_memcpy - BAR; + return(kni_memcpy(to, from, n)); + } else { + *caller = (int)best_memcpy_final - BAR; + return(__memcpy(to, from, n)); + } + } else { + return(__memcpy(to, from, n)); + } +} + +static void * best_memcpy_final(void * to, const void * from, size_t n) +{ + return(__memcpy(to, from, n)); +} + +void * 
best_memset(void * s, char c, size_t count) +{ + int BAR = (int)__builtin_return_address(0); + int *caller = (int *)BAR - 1; + if(boot_cpu_data.enable_fixups) { + if ( (boot_cpu_data.mmu_cr4_features & X86_CR4_OSFXSR) && + (boot_cpu_data.x86_capability & X86_FEATURE_XMM) ) { + *caller = (int)kni_memset - BAR; + return(kni_memset(s, c, count)); + } else { + *caller = (int)best_memset_final - BAR; + return(__memset_generic(s, c, count)); + } + } else { + return(__memset_generic(s, c, count)); + } +} + +static void * best_memset_final(void * s, char c, size_t count) +{ + return(__memset_generic(s, c, count)); +} + +unsigned long +best_copy_to_user(void *to, const void *from, unsigned long n) +{ + int BAR = (int)__builtin_return_address(0); + int *caller = (int *)BAR - 1; + if(boot_cpu_data.enable_fixups) { + if ( (boot_cpu_data.mmu_cr4_features & X86_CR4_OSFXSR) && + (boot_cpu_data.x86_capability & X86_FEATURE_XMM) ) { + *caller = (int)kni_copy_to_user - BAR; + return(kni_copy_to_user(to, from, n)); + } else { + *caller = (int)best_copy_to_user_final - BAR; + return(best_copy_to_user_final(to, from, n)); + } + } else { + if (access_ok(VERIFY_WRITE, to, n)) { + __copy_user(to,from,n); + } + return n; + } +} + +static unsigned long +best_copy_to_user_final(void *to, const void *from, unsigned long n) +{ + if (access_ok(VERIFY_WRITE, to, n)) { + __copy_user(to,from,n); + } + return n; +} + +unsigned long +best_copy_from_user(void *to, const void *from, unsigned long n) +{ + int BAR = (int)__builtin_return_address(0); + int *caller = (int *)BAR - 1; + if(boot_cpu_data.enable_fixups) { + if ( (boot_cpu_data.mmu_cr4_features & X86_CR4_OSFXSR) && + (boot_cpu_data.x86_capability & X86_FEATURE_XMM) ) { + *caller = (int)kni_copy_from_user - BAR; + return(kni_copy_from_user(to, from, n)); + } else { + *caller = (int)best_copy_from_user_final - BAR; + return(best_copy_from_user_final(to, from, n)); + } + } else { + if (access_ok(VERIFY_READ, from, n)) { + 
__copy_user_zeroing(to,from,n); + } + return n; + } +} + +static unsigned long +best_copy_from_user_final(void *to, const void *from, unsigned long n) +{ + if (access_ok(VERIFY_READ, from, n)) { + __copy_user_zeroing(to,from,n); + } + return n; +} + +unsigned long +__best_copy_to_user(void *to, const void *from, unsigned long n) +{ + int BAR = (int)__builtin_return_address(0); + int *caller = (int *)BAR - 1; + if(boot_cpu_data.enable_fixups) { + if ( (boot_cpu_data.mmu_cr4_features & X86_CR4_OSFXSR) && + (boot_cpu_data.x86_capability & X86_FEATURE_XMM) ) { + *caller = (int)__kni_copy_to_user_nocheck - BAR; + return(__kni_copy_to_user_nocheck(to, from, n)); + } else { + *caller = (int)__best_copy_to_user_final - BAR; + return(__best_copy_to_user_final(to, from, n)); + } + } else { + __copy_user(to,from,n); + return n; + } +} + +static unsigned long +__best_copy_to_user_final(void *to, const void *from, unsigned long n) +{ + __copy_user(to,from,n); + return n; +} + +unsigned long +__best_copy_from_user(void *to, const void *from, unsigned long n) +{ + int BAR = (int)__builtin_return_address(0); + int *caller = (int *)BAR - 1; + if(boot_cpu_data.enable_fixups) { + if ( (boot_cpu_data.mmu_cr4_features & X86_CR4_OSFXSR) && + (boot_cpu_data.x86_capability & X86_FEATURE_XMM) ) { + *caller = (int)__kni_copy_from_user_nocheck - BAR; + return(__kni_copy_from_user_nocheck(to, from, n)); + } else { + *caller = (int)__best_copy_from_user_final - BAR; + return(__best_copy_from_user_final(to, from, n)); + } + } else { + __copy_user_zeroing(to,from,n); + return n; + } +} + +static unsigned long +__best_copy_from_user_final(void *to, const void *from, unsigned long n) +{ + __copy_user_zeroing(to,from,n); + return n; +} + Index: oldkernel/linux/arch/i386/lib/simd.c diff -u /dev/null linux/arch/i386/lib/simd.c:1.1 --- /dev/null Mon Jul 31 21:12:24 2000 +++ linux/arch/i386/lib/simd.c Thu Jun 1 15:05:19 2000 @@ -0,0 +1,435 @@ +/* + * SIMD functions. 
These replace the functions in asm-i386/string.h + * whenever it makes sense. These also un-inline those functions. + * + * Copyright 1999, Doug Ledford + * + * These functions are simple and trivial, consider them to be + * public domain + */ + +#include +#include +#include +#include +#include + +extern void * kni_memcpy(void * to, const void * from, size_t n) +{ + unsigned long flags; + void *ret=to; + size_t size; + int recursive = 0; + char xmm_space[64]; + + /* + * If the transfer is too small, then use the generic routine. + */ + if (n < 128) { + return(__memcpy(to, from, n)); + } + kernel_take_fpu_kni(recursive,&xmm_space[0],NULL,flags); + + /* + * Align the destination on a 16byte boundary. + * The source doesn't have to be aligned. + */ + if ( (unsigned long)to & 0xf ) { + size = 0x10 - ((unsigned long)to & 0xf); + __asm__ __volatile__("movups (%0),%%xmm0\n\t" + "movups %%xmm0,(%1)\n\t" + : + : "r" (from), + "r" (to)); + n -= size; + from += size; + to += size; + } + /* + * If the copy would have tailings, take care of them + * now instead of later + */ + if(n & 0xf) { + size = n - 0x10; + __asm__ __volatile__("movups (%0),%%xmm0\n\t" + "movups %%xmm0,(%1)\n\t" + : + : "r" (from + size), + "r" (to + size)); + n &= ~0xf; + } + /* + * Prefetch the first two cachelines now. + */ + __asm__ __volatile__("prefetchnta 0x00(%0)\n\t" + "prefetchnta 0x20(%0)\n\t" + : + : "r" (from)); + /* + * Copy 32 bytes at a time. The single unroll is good + * for a 30% performance boost in the copy. Additional + * unrolls are not productive. We are guaranteed to + * have at least 32 bytes of data to copy since the + * macro in string.h doesn't call into this function + * with less than 64 bytes of copy and we lost < 32 + * bytes to alignment earlier. 
+ */ + while (n >= 0x20) { + __asm__ __volatile__( + "movups 0x00(%0),%%xmm0\n\t" + "movups 0x10(%0),%%xmm1\n\t" + "movntps %%xmm0,0x00(%1)\n\t" + "movntps %%xmm1,0x10(%1)\n\t" + : + : "r" (from), "r" (to) + : "memory"); + from += 0x20; + /* + * Note: Intermixing the prefetch at *exactly* this point + * in time has been shown to be the fastest possible. + * Timing these prefetch instructions is a complete black + * art with nothing but trial and error showing the way. + * To that extent, this optimum version was found by using + * a userland version of this routine that we clocked for + * lots of runs. We then fiddled with ordering until we + * settled on our highest speed routines. So, the long + * and short of this is, don't mess with instruction ordering + * here or suffer performance penalties you will. + */ + __asm__ __volatile__( + "prefetchnta 0x20(%0)\n\t" + : + : "r" (from)); + to += 0x20; + n -= 0x20; + } + if (n) { + __asm__ __volatile__("movups 0x00(%0),%%xmm0\n\t" + "movntps %%xmm0,0x00(%1)\n\t" + : + : "r" (from), "r" (to) + : "memory"); + } + SFENCE(); + kernel_release_fpu_kni(recursive,&xmm_space[0],flags); + return(ret); +} + +extern void * kni_memset(void * s, char c, size_t count) +{ + unsigned long flags; + size_t size; + void *ret=s; + int recursive = 0; + char xmm_space[64]; + + /* + * If the transfer is too small, then use the generic routine. + */ + if (count < 128) { + return(__memset_generic(s, c, count)); + } + kernel_take_fpu_kni(recursive,&xmm_space[0],NULL,flags); + /* + * Load up our XMM register with the stuff to set mem with + */ + if(c == '\0') { + __asm__ __volatile__("xorps %%xmm0,%%xmm0\n\t" + "movups %%xmm0,(%0)\n\t" + : + : "r" (s)); + } else { + __memset_generic(s, c, 0x10); + __asm__ __volatile__("movups (%0),%%xmm0" + : + : "r" (s)); + } + /* + * align the destination on a 16 byte boundary, we can simply + * do the math to align things since we already populated the + * first 16 bytes. 
+ */ + size = (0x10 - ((unsigned long)s & 0xf)); + count -= size; + s += size; + /* + * On the off chance we have tailings due to alignment issues, + * do them now to make later more efficient + */ + if(count & 0xf) { + __asm__ __volatile__("movups %%xmm0,(%0)" + : + : "r" (s + (count - 0x10)) + : "memory"); + count &= ~0xf; + } + /* + * Do the copy by plopping out the register to memory. + * Note: Unrolling this was *totally* unproductive. My benchmark + * showed that one or two plops per iteration produced the same + * speed to within .06 MByte/s of speed. Considering that the + * routine benchmarked at over 3000 MByte/s, .06 is not statistically + * significant and only doing one drop per loop simplifies + * overhead of book keeping. + */ + while(count) { + __asm__ __volatile__("movntps %%xmm0,0x00(%0)\n\t" + : + : "r" (s)); + s += 0x10; + count -= 0x10; + } + SFENCE(); + kernel_release_fpu_kni(recursive,&xmm_space[0],flags); + return(ret); +} + +#define __kni_copy_to_user(to,from,size) \ +do { \ + int __d0, __d1, tmp, tmp2; \ + __asm__ __volatile__( \ + " movl %1,%4\n" \ + " andl $0xf,%4\n" \ + " movups (%2),%%xmm0\n" \ + "1: movups %%xmm0,(%1)\n" \ + " movl $0x10,%3\n" \ + " subl %4,%3\n" \ + " addl %3,%2\n" \ + " addl %3,%1\n" \ + " subl %3,%0\n" \ + " prefetchnta 0x00(%2)\n" \ + " prefetchnta 0x20(%2)\n" \ + " jmp 200f\n" \ + "100: movups 0x00(%2),%%xmm0\n" \ + " movups 0x10(%2),%%xmm1\n" \ + "2: movntps %%xmm0,0x00(%1)\n" \ + "3: movntps %%xmm1,0x10(%1)\n" \ + " addl $0x20,%2\n" \ + " prefetchnta 0x20(%2)\n" \ + " addl $0x20,%1\n" \ + " subl $0x20,%0\n" \ + "200: cmpl $0x1f,%0\n" \ + " ja 100b\n" \ + " cmpl $0xf,%0\n" \ + " jbe 300f\n" \ + " movups 0x00(%2),%%xmm0\n" \ + "4: movntps %%xmm0,0x00(%1)\n" \ + " addl $0x10,%2\n" \ + " addl $0x10,%1\n" \ + " subl $0x10,%0\n" \ + "300: testl %0,%0\n" \ + " je 400f\n" \ + " movl $0x10,%3\n" \ + " subl %0,%3\n" \ + " subl %3,%1\n" \ + " subl %3,%2\n" \ + " movups 0x00(%2),%%xmm0\n" \ + "5: movups %%xmm0,0x00(%1)\n" 
\ + " addl $0x10,%2\n" \ + " addl $0x10,%1\n" \ + " xorl %0,%0\n" \ + "400:\n" \ + ".section .fixup,\"ax\"\n" \ + "6: jmp 400b\n" \ + "7: addl $0x10,%1\n" \ + " addl $0x10,%2\n" \ + " subl $0x10,%0\n" \ + " jmp 400b\n" \ + "8: addl %3,%1\n" \ + " addl %3,%2\n" \ + " jmp 400b\n" \ + ".previous\n" \ + ".section __ex_table,\"a\"\n" \ + " .align 4\n" \ + " .long 1b,6b\n" \ + " .long 2b,6b\n" \ + " .long 3b,7b\n" \ + " .long 4b,6b\n" \ + " .long 5b,8b\n" \ + ".previous" \ + : "=&c"(size), "=&D" (__d0), "=&S" (__d1), "=r"(tmp), \ + "=r"(tmp2) \ + : "0"(size), "1"(to), "2"(from) \ + : "memory"); \ +} while (0) + +#define __kni_copy_from_user(to,from,size) \ +do { \ + int __d0, __d1, tmp, tmp2; \ + __asm__ __volatile__( \ + " movl %1,%4\n" \ + " andl $0xf,%4\n" \ + "1: movups (%2),%%xmm0\n" \ + " movups %%xmm0,(%1)\n" \ + " movl $0x10,%3\n" \ + " subl %4,%3\n" \ + " addl %3,%2\n" \ + " addl %3,%1\n" \ + " subl %3,%0\n" \ + " prefetchnta 0x00(%2)\n" \ + " prefetchnta 0x20(%2)\n" \ + " jmp 100f\n" \ + "2: movups 0x00(%2),%%xmm0\n" \ + "3: movups 0x10(%2),%%xmm1\n" \ + " movntps %%xmm0,0x00(%1)\n" \ + " movntps %%xmm1,0x10(%1)\n" \ + " addl $0x20,%2\n" \ + " prefetchnta 0x20(%2)\n" \ + " addl $0x20,%1\n" \ + " subl $0x20,%0\n" \ + "100: cmpl $0x1f,%0\n" \ + " ja 2b\n" \ + " cmpl $0xf,%0\n" \ + " jbe 200f\n" \ + "4: movups 0x00(%2),%%xmm0\n" \ + " movntps %%xmm0,0x00(%1)\n" \ + " addl $0x10,%2\n" \ + " addl $0x10,%1\n" \ + " subl $0x10,%0\n" \ + "200: testl %0,%0\n" \ + " je 300f\n" \ + " movl $0x10,%3\n" \ + " subl %0,%3\n" \ + " subl %3,%1\n" \ + " subl %3,%2\n" \ + "5: movups 0x00(%2),%%xmm0\n" \ + " movups %%xmm0,0x00(%1)\n" \ + " addl $0x10,%2\n" \ + " addl $0x10,%1\n" \ + " xorl %0,%0\n" \ + "300:\n" \ + ".section .fixup,\"ax\"\n" \ + "6: xorps %%xmm0,%%xmm0\n" \ + " movups %%xmm0,(%1)\n" \ + " movl $0x10,%3\n" \ + " subl %4,%3\n" \ + " addl %3,%1\n" \ + " movl %3,%4\n" \ + " movl %0,%3\n" \ + " subl %4,%3\n" \ + " jmp 600f\n" \ + "7: subl $0x10,%0\n" \ + " addl 
$0x10,%1\n" \ + "400: movl %0,%3\n" \ + " xorps %%xmm0,%%xmm0\n" \ + " jmp 600f\n" \ + "500: movntps %%xmm0,0x00(%1)\n" \ + " movntps %%xmm0,0x10(%1)\n" \ + " addl $0x20,%1\n" \ + " subl $0x20,%3\n" \ + "600: cmpl $0x1f,%3\n" \ + " ja 500b\n" \ + " cmpl $0xf,%3\n" \ + " jbe 700f\n" \ + " movntps %%xmm0,0x00(%1)\n" \ + " addl $0x10,%1\n" \ + " subl $0x10,%3\n" \ + "700: testl %3,%3\n" \ + " je 300b\n" \ + " xorl %4,%4\n" \ + " movb %4,(%1)\n" \ + " inc %1\n" \ + " dec %3\n" \ + " jmp 700b\n" \ + "8: addl %3,%1\n" \ + " movl %0,%3\n" \ + " jmp 700b\n" \ + ".previous\n" \ + ".section __ex_table,\"a\"\n" \ + " .align 4\n" \ + " .long 1b,6b\n" \ + " .long 2b,400b\n" \ + " .long 3b,7b\n" \ + " .long 4b,400b\n" \ + " .long 5b,8b\n" \ + ".previous" \ + : "=&c"(size), "=&D" (__d0), "=&S" (__d1), "=r"(tmp), \ + "=q"(tmp2) \ + : "0"(size), "1"(to), "2"(from) \ + : "memory"); \ +} while (0) + + +unsigned long +__kni_copy_to_user_nocheck(void *to, const void *from, unsigned long n) +{ + unsigned long flags; + int recursive = 0; + char xmm_space[64]; + char xmm_reg_space[64]; /* in case we switch context */ + + if (n >= 128) { + kernel_take_fpu_kni(recursive,&xmm_space[0],&xmm_reg_space[0],flags); + __kni_copy_to_user(to,from,n); + SFENCE(); + kernel_release_fpu_kni(recursive,&xmm_space[0],flags); + } else { + __copy_user(to,from,n); + } + return n; +} + +unsigned long +__kni_copy_from_user_nocheck(void *to, const void *from, unsigned long n) +{ + unsigned long flags; + int recursive = 0; + char xmm_space[64]; + char xmm_reg_space[64]; /* in case we switch context */ + + if (n >= 128) { + kernel_take_fpu_kni(recursive,&xmm_space[0],&xmm_reg_space[0],flags); + __kni_copy_from_user(to,from,n); + SFENCE(); + kernel_release_fpu_kni(recursive,&xmm_space[0],flags); + } else { + __copy_user_zeroing(to,from,n); + } + return n; +} + + + +unsigned long +kni_copy_to_user(void *to, const void *from, unsigned long n) +{ + unsigned long flags; + int recursive = 0; + char xmm_space[64]; + char 
xmm_reg_space[64]; /* in case we switch context */ + + if (access_ok(VERIFY_WRITE, to, n)) { + if (n >= 128) { + kernel_take_fpu_kni(recursive,&xmm_space[0],&xmm_reg_space[0],flags); + __kni_copy_to_user(to,from,n); + SFENCE(); + kernel_release_fpu_kni(recursive,&xmm_space[0],flags); + } else { + __copy_user(to,from,n); + } + } + return n; +} + +unsigned long +kni_copy_from_user(void *to, const void *from, unsigned long n) +{ + unsigned long flags; + int recursive = 0; + char xmm_space[64]; + char xmm_reg_space[64]; /* in case we switch context */ + + if (access_ok(VERIFY_READ, from, n)) { + if (n >= 128) { + kernel_take_fpu_kni(recursive,&xmm_space[0],&xmm_reg_space[0],flags); + __kni_copy_from_user(to,from,n); + SFENCE(); + kernel_release_fpu_kni(recursive,&xmm_space[0],flags); + } else { + __copy_user_zeroing(to,from,n); + } + } + return n; +} + + Index: oldkernel/linux/arch/i386/mm/init.c diff -u linux/arch/i386/mm/init.c:1.1.1.1 linux/arch/i386/mm/init.c:1.2 --- linux/arch/i386/mm/init.c:1.1.1.1 Wed May 31 12:33:53 2000 +++ linux/arch/i386/mm/init.c Thu Jun 1 15:05:19 2000 @@ -184,34 +184,6 @@ extern char _text, _etext, _edata, __bss_start, _end; extern char __init_begin, __init_end; -#define X86_CR4_VME 0x0001 /* enable vm86 extensions */ -#define X86_CR4_PVI 0x0002 /* virtual interrupts flag enable */ -#define X86_CR4_TSD 0x0004 /* disable time stamp at ipl 3 */ -#define X86_CR4_DE 0x0008 /* enable debugging extensions */ -#define X86_CR4_PSE 0x0010 /* enable page size extensions */ -#define X86_CR4_PAE 0x0020 /* enable physical address extensions */ -#define X86_CR4_MCE 0x0040 /* Machine check enable */ -#define X86_CR4_PGE 0x0080 /* enable global pages */ -#define X86_CR4_PCE 0x0100 /* enable performance counters at ipl 3 */ - -/* - * Save the cr4 feature set we're using (ie - * Pentium 4MB enable and PPro Global page - * enable), so that any CPU's that boot up - * after us can get the correct flags. 
- */ -unsigned long mmu_cr4_features __initdata = 0; - -static inline void set_in_cr4(unsigned long mask) -{ - mmu_cr4_features |= mask; - __asm__("movl %%cr4,%%eax\n\t" - "orl %0,%%eax\n\t" - "movl %%eax,%%cr4\n" - : : "irg" (mask) - :"ax"); -} - /* * allocate page table(s) for compile-time fixed mappings */ Index: oldkernel/linux/include/asm-i386/bugs.h diff -u linux/include/asm-i386/bugs.h:1.1.1.1 linux/include/asm-i386/bugs.h:1.2 --- linux/include/asm-i386/bugs.h:1.1.1.1 Wed May 31 12:33:49 2000 +++ linux/include/asm-i386/bugs.h Thu Jun 1 15:05:19 2000 @@ -18,6 +18,7 @@ */ #include +#include #include #include @@ -69,6 +70,45 @@ #endif return; } +#ifdef CONFIG_X86_FX + /* + * If we got so far we can safely turn on FXSAVE/FXRESTORE, + * but make sure we are 16-byte aligned first. + */ + if (offsetof(struct task_struct, tss.i387.hard.fxsave.fxcwd) & 15) { + /* + * This triggers a link-time error if we manage to + * break alignment somehow. + */ + extern void __buggy_fxsr_alignment(void); + + __buggy_fxsr_alignment(); + } + if (boot_cpu_data.x86_capability & X86_FEATURE_FXSR) { + printk("Enabling extended fast FPU save and restore..."); + set_in_cr4(X86_CR4_OSFXSR); + printk("done.\n"); + } + /* + * Note, Katmai instructions are enabled as soon as you start + * using the FXSAVE/RESTORE stuff. This setting only + * indicates support for the masked/unmasked exceptions on + * the new PIII cpus. We don't have an Exception 19 handler + * for this yet, so we leave this bit clear for now. Setting + * it would kill us the first time we take an unmasked KNI + * exception. No userland apps currently use KNI, so it isn't + * an issue yet. We should have the handler added by then.
+ */ + if (boot_cpu_data.x86_capability & X86_FEATURE_XMM) { + printk("Not enabling KNI unmasked exception support\n"); + printk("Exception 19 error handler not integrated yet\n"); +#if 0 + set_in_cr4(X86_CR4_OSXMMEXCPT); + printk("done.\n"); +#endif + } +#endif + disable_serial_nr(); if (mca_pentium_flag) { /* The IBM Model 95 machines with pentiums lock up on * fpu test, so we avoid it. All pentiums have inbuilt @@ -117,23 +157,23 @@ return; if (!ignore_irq13) { printk("OK, FPU using old IRQ 13 error reporting\n"); - return; + } else { + __asm__("fninit\n\t" + "fldl %1\n\t" + "fdivl %2\n\t" + "fmull %2\n\t" + "fldl %1\n\t" + "fsubp %%st,%%st(1)\n\t" + "fistpl %0\n\t" + "fwait\n\t" + "fninit" + : "=m" (*&boot_cpu_data.fdiv_bug) + : "m" (*&x), "m" (*&y)); + if (!boot_cpu_data.fdiv_bug) + printk("OK, FPU using exception 16 error reporting.\n"); + else + printk("Hmm, FPU using exception 16 error reporting with FDIV bug.\n"); } - __asm__("fninit\n\t" - "fldl %1\n\t" - "fdivl %2\n\t" - "fmull %2\n\t" - "fldl %1\n\t" - "fsubp %%st,%%st(1)\n\t" - "fistpl %0\n\t" - "fwait\n\t" - "fninit" - : "=m" (*&boot_cpu_data.fdiv_bug) - : "m" (*&x), "m" (*&y)); - if (!boot_cpu_data.fdiv_bug) - printk("OK, FPU using exception 16 error reporting.\n"); - else - printk("Hmm, FPU using exception 16 error reporting with FDIV bug.\n"); } __initfunc(static void check_hlt(void)) @@ -419,5 +459,7 @@ check_amd_k6(); check_pentium_f00f(); check_cyrix_coma(); + boot_cpu_data.enable_fixups = 1; /* should be safe to use MMX/MMX2 */ + /* kernel functions now */ system_utsname.machine[1] = '0' + boot_cpu_data.x86; } Index: oldkernel/linux/include/asm-i386/i387.h diff -u /dev/null linux/include/asm-i386/i387.h:1.1 --- /dev/null Mon Jul 31 21:12:25 2000 +++ linux/include/asm-i386/i387.h Thu Jun 1 15:05:19 2000 @@ -0,0 +1,313 @@ +/* + * include/asm-i386/i387.h + * + * Copyright (c) 1999 Doug Ledford + * + * Made from various code bits pulled from other files + * in order to put things together in a way 
that made + * sense. + * + * FX/FPU support: + * Copyright (c) 1999 Ingo Molnar , + * Gabriel Paubert + */ + +#ifndef __ASM_I386_I387_H +#define __ASM_I386_I387_H + +extern int i387_hard_to_user ( struct user_i387_struct * user, + union i387_hard_union * hard); +extern int i387_user_to_hard ( union i387_hard_union * hard, + struct user_i387_struct * user); + +/* + * Fill out the reserved bits, treat it as an fsave struct since the + * union makes this work for both fsave and fxsave structs. + */ +#ifdef CONFIG_X86_FX + +#define i387_save_hard(x) \ +do { \ + if (boot_cpu_data.x86_capability & X86_FEATURE_FXSR) { \ + __asm__ __volatile__("fxsave %0" \ + : "=m" ((x).hard.fxsave.fxcwd)); \ + } else { \ + __asm__ __volatile__("fnsave %0; fwait;" \ + : "=m" ((x).hard.fsave.cwd)); \ + } \ +} while(0) + +#define i387_restore_hard(x) \ +do { \ + if (boot_cpu_data.x86_capability & X86_FEATURE_FXSR) { \ + __asm__ __volatile__("fxrstor %0" \ + : \ + : "m" ((x).hard.fxsave.fxcwd)); \ + } else { \ + __asm__ __volatile__("frstor %0" \ + : \ + :"m" ((x).hard.fsave.cwd)); \ + } \ +} while(0) + +#define i387_set_cwd(x,v) \ +do { \ + if (boot_cpu_data.x86_capability & X86_FEATURE_FXSR) { \ + (x).fxsave.fxcwd = (short)(v); \ + } else { \ + (x).fsave.cwd = ((long)(v) | 0xffff0000); \ + } \ +} while(0) + +#define i387_set_swd(x,v) \ +do { \ + if (boot_cpu_data.x86_capability & X86_FEATURE_FXSR) { \ + (x).fxsave.fxswd = (short)(v); \ + } else { \ + (x).fsave.swd = ((long)(v) | 0xffff0000); \ + } \ +} while(0) + +#define i387_set_twd(x,v) \ +do { \ + if (boot_cpu_data.x86_capability & X86_FEATURE_FXSR) { \ + (x).fxsave.fxtwd = (short)(v); \ + } else { \ + (x).fsave.twd = ((long)(v) | 0xffff0000); \ + } \ +} while(0) + +static inline unsigned short fputag_KNI_to_387(unsigned char tb) { + unsigned short tw = tb; + tw = (tw | (tw << 4)) & 0x0f0f; /* zzzz7654zzzz3210 */ + tw = (tw | (tw << 2)) & 0x3333; /* zz76zz54zz32zz10 */ + tw = (tw | (tw << 1)) & 0x5555; /* z7z6z5z4z3z2z1z0 */ + tw = 
~(tw * 3); + return tw; +} + +static inline unsigned char fputag_387_to_KNI(unsigned short tw) { + tw = ~tw & 0x5555; /* z7z6z5z4z3z2z1z0 */ + tw = (tw | (tw >> 1)) & 0x3333; /* zz76zz54zz32zz10 */ + tw = (tw | (tw >> 2)) & 0x0f0f; /* zzzz7654zzzz3210 */ + tw = (tw | (tw >> 4)) & 0x00ff; /* zzzzzzzz76543210 */ + return tw; +} + +#else /* CONFIG_X86_FX */ + +#define i387_save_hard(x) \ +do { \ + __asm__ __volatile__("fnsave %0; fwait;" \ + : "=m" ((x).hard.fsave.cwd)); \ +} while(0) + +#define i387_restore_hard(x) \ +do { \ + __asm__ __volatile__("frstor %0" \ + : \ + :"m" ((x).hard.fsave.cwd)); \ +} while(0) + +#define i387_set_cwd(x,v) \ +do { (x).fsave.cwd = ((long)(v) | 0xffff0000); } while(0) + +#define i387_set_swd(x,v) \ +do { (x).fsave.swd = ((long)(v) | 0xffff0000); } while(0) + +#define i387_set_twd(x,v) \ +do { (x).fsave.twd = ((long)(v) | 0xffff0000); } while(0) + +#endif /* CONFIG_X86_FX */ + +/* + * FPU lazy state save handling.. + */ +#define save_kern_fpu(tsk) do { \ + if(tsk->tss.mmx_reg_space != NULL) \ + __asm__("movq %%mm0, 0x00(%0)\n\t" \ + "movq %%mm1, 0x08(%0)\n\t" \ + "movq %%mm2, 0x10(%0)\n\t" \ + "movq %%mm3, 0x18(%0)\n\t" \ + :: "r" (tsk->tss.mmx_reg_space):"memory"); \ + if(tsk->tss.kni_reg_space != NULL) \ + __asm__("movups %%xmm0, 0x00(%0)\n\t" \ + "movups %%xmm1, 0x10(%0)\n\t" \ + "movups %%xmm2, 0x20(%0)\n\t" \ + "movups %%xmm3, 0x30(%0)\n\t" \ + :: "r" (tsk->tss.kni_reg_space):"memory"); \ +} while (0) + +#define unlazy_fpu(tsk) do { \ + if (tsk->tss.x86_fpustate & X86_FPUSTATE_KERN_ANY) { \ + save_kern_fpu(tsk); \ + if (!(tsk->flags & PF_USEDFPU)) { \ + stts(); \ + } \ + } \ + if (tsk->flags & PF_USEDFPU) { \ + if (!(tsk->tss.x86_fpustate & X86_FPUSTATE_USER_SAVED)) { \ + i387_save_hard(tsk->tss.i387); \ + } \ + tsk->flags &= ~PF_USEDFPU; \ + stts(); \ + } \ +} while (0) + +#define clear_fpu(tsk) do { \ + if ( (tsk->flags & PF_USEDFPU) || \ + (tsk->tss.x86_fpustate) ) { \ + tsk->flags &= ~PF_USEDFPU; \ + tsk->tss.x86_fpustate = 0; \ 
+ stts(); \ + } \ +} while (0) + +/* + * For when we want to use the FPU in kernel code + * + * These functions allow the use of up to 4 KNI based xmm registers on the + * Pentium III processors or up to 4 MMX registers on Pentium MMX and above + * or compatible processors. Pick the routines that you need based on the + * regs you are going to use. Keep in mind that these are intended to be + * used only after you've verified that the processor supports these + * operations. Use them before you've done that and watch your machine go + * boom. Take a look in arch/i386/lib/best_function.c for an example of + * how to fixup the kernel with kni/mmx using functions once the CPU + * capabilities have been determined. + * + * In all of these functions: + * + * recursive - int, used to determine what the state is at restore time + * regs - char * to an array that is 32 bytes for mmx and 64 bytes for kni + * which is then used to save off the contents of the current + * regs to be recursively safe + * task_switch_regs - char * to another array of the same size as the one + * above, but this array is optional. If your function might get + * pre-empted by another task then this pointer should be non-NULL + * so that at unlazy_fpu() time in the switch_to() function we + * can save your register state (copy_*_user functions are an example + * of functions that need this, since they can take a page fault and + * while that fault is being serviced the scheduler is free to run + * another task entirely). 
+ * irqflags - unsigned long used to store IRQ state + */ + +#define SAVE_MMX_REGS(regs) \ + __asm__ __volatile__("movq %%mm0, 0x00(%0)\n\t" \ + "movq %%mm1, 0x08(%0)\n\t" \ + "movq %%mm2, 0x10(%0)\n\t" \ + "movq %%mm3, 0x18(%0)\n\t" \ + : : "r" ((regs)) : "memory" ); + +#define RESTORE_MMX_REGS(regs) \ + __asm__ __volatile__("movq 0x00(%0), %%mm0\n\t" \ + "movq 0x08(%0), %%mm1\n\t" \ + "movq 0x10(%0), %%mm2\n\t" \ + "movq 0x18(%0), %%mm3\n\t" \ + : : "r" ((regs))); + +#define SAVE_KNI_REGS(regs) \ + __asm__ __volatile__("movups %%xmm0, 0x00(%0)\n\t" \ + "movups %%xmm1, 0x10(%0)\n\t" \ + "movups %%xmm2, 0x20(%0)\n\t" \ + "movups %%xmm3, 0x30(%0)\n\t" \ + : : "r" ((regs)) : "memory" ); + +#define RESTORE_KNI_REGS(regs) \ + __asm__ __volatile__("movups 0x00(%0), %%xmm0\n\t" \ + "movups 0x10(%0), %%xmm1\n\t" \ + "movups 0x20(%0), %%xmm2\n\t" \ + "movups 0x30(%0), %%xmm3\n\t" \ + : : "r" ((regs))); + +#define SFENCE() \ + __asm__ __volatile__("sfence":::"memory") + + +extern spinlock_t kern_fpu_lock; + +/* + * Although it seems wasteful to do a unilateral clts() in the take_fpu + * functions, the reason I did it that way is because the alternative is + * to test for: + * + * if ( ( (current->flags & PF_USEDFPU) && + * (current->tss.x86_fpustate & X86_FPUSTATE_USER_SAVED) ) || + * ( !(current->flags & PF_USEDFPU) && + * !(current->tss.x86_fpustate & X86_FPUSTATE_KERN_ANY) ) ) + * + */ + +#define kernel_take_fpu_mmx(recursive, regs, task_switch_regs, irqflags) do { \ + spin_lock_irqsave(&kern_fpu_lock, (irqflags)); \ + clts(); \ + (recursive) = (current->tss.x86_fpustate & X86_FPUSTATE_KERN_ANY); \ + if ( (current->flags & PF_USEDFPU) && \ + !(current->tss.x86_fpustate & X86_FPUSTATE_USER_SAVED) ){ \ + i387_save_hard(current->tss.i387); \ + current->tss.x86_fpustate |= X86_FPUSTATE_USER_SAVED; \ + } \ + if ((recursive) & X86_FPUSTATE_KERN_MMX) { \ + SAVE_MMX_REGS((regs)); \ + } else { \ + current->tss.mmx_reg_space = (task_switch_regs); \ + current->tss.x86_fpustate |= 
X86_FPUSTATE_KERN_MMX; \ + } \ + spin_unlock_irqrestore(&kern_fpu_lock, (irqflags)); \ +} while (0) + +#define kernel_release_fpu_mmx(recursive, regs, irqflags) do { \ + spin_lock_irqsave(&kern_fpu_lock, (irqflags)); \ + if ((recursive) & X86_FPUSTATE_KERN_MMX) { \ + RESTORE_MMX_REGS((regs)); \ + } else { \ + current->tss.x86_fpustate &= ~X86_FPUSTATE_KERN_MMX; \ + current->tss.mmx_reg_space = NULL; \ + } \ + if ((recursive) == 0) { \ + stts(); \ + } \ + spin_unlock_irqrestore(&kern_fpu_lock, (irqflags)); \ +} while (0) + +#define kernel_take_fpu_kni(recursive, regs, task_switch_regs, irqflags) do { \ + spin_lock_irqsave(&kern_fpu_lock, (irqflags)); \ + clts(); \ + (recursive) = current->tss.x86_fpustate; \ + if ( (current->flags & PF_USEDFPU) || \ + (current->tss.x86_fpustate & X86_FPUSTATE_KERN_KNI) ) { \ + SAVE_KNI_REGS((regs)); \ + } \ + if (!(current->tss.x86_fpustate & X86_FPUSTATE_KERN_KNI)) { \ + current->tss.kni_reg_space = (task_switch_regs); \ + current->tss.x86_fpustate |= X86_FPUSTATE_KERN_KNI; \ + } \ + spin_unlock_irqrestore(&kern_fpu_lock, (irqflags)); \ +} while (0) + + +#define kernel_release_fpu_kni(recursive, regs, irqflags) do { \ + spin_lock_irqsave(&kern_fpu_lock, (irqflags)); \ + if ( (current->tss.x86_fpustate & X86_FPUSTATE_USER_SAVED) && \ + !(((recursive) & X86_FPUSTATE_USER_SAVED) && \ + (current->flags & PF_USEDFPU)) ) { \ + i387_restore_hard(current->tss.i387); \ + current->tss.x86_fpustate &= ~X86_FPUSTATE_USER_SAVED; \ + } \ + if ( ((recursive) & X86_FPUSTATE_KERN_KNI) || \ + (current->flags & PF_USEDFPU) ) { \ + RESTORE_KNI_REGS((regs)); \ + } \ + if (((recursive) & X86_FPUSTATE_KERN_KNI) == 0) { \ + current->tss.x86_fpustate &= ~X86_FPUSTATE_KERN_KNI; \ + current->tss.kni_reg_space = NULL; \ + } \ + if ( ((recursive) == 0) && ((current->flags & PF_USEDFPU) == 0) ) { \ + stts(); \ + } \ + spin_unlock_irqrestore(&kern_fpu_lock, (irqflags)); \ +} while (0) + + +#endif /* __ASM_I386_I387_H */ Index: 
oldkernel/linux/include/asm-i386/io.h diff -u linux/include/asm-i386/io.h:1.1.1.1 linux/include/asm-i386/io.h:1.2 --- linux/include/asm-i386/io.h:1.1.1.1 Wed May 31 12:33:49 2000 +++ linux/include/asm-i386/io.h Thu Jun 1 15:05:19 2000 @@ -157,9 +157,9 @@ #define writew(b,addr) (*(volatile unsigned short *) __io_virt(addr) = (b)) #define writel(b,addr) (*(volatile unsigned int *) __io_virt(addr) = (b)) -#define memset_io(a,b,c) memset(__io_virt(a),(b),(c)) -#define memcpy_fromio(a,b,c) memcpy((a),__io_virt(b),(c)) -#define memcpy_toio(a,b,c) memcpy(__io_virt(a),(b),(c)) +#define memset_io(a,b,c) __memset_generic(__io_virt(a),(b),(c)) +#define memcpy_fromio(a,b,c) __memcpy((a),__io_virt(b),(c)) +#define memcpy_toio(a,b,c) __memcpy(__io_virt(a),(b),(c)) /* * Again, i386 does not require mem IO specific function. Index: oldkernel/linux/include/asm-i386/processor.h diff -u linux/include/asm-i386/processor.h:1.1.1.1 linux/include/asm-i386/processor.h:1.2 --- linux/include/asm-i386/processor.h:1.1.1.1 Wed May 31 12:33:49 2000 +++ linux/include/asm-i386/processor.h Thu Jun 1 15:05:19 2000 @@ -7,10 +7,11 @@ #ifndef __ASM_I386_PROCESSOR_H #define __ASM_I386_PROCESSOR_H +#include #include #include -#include #include +#include /* * CPU type and hardware bug flags. Kept separately for each CPU. 
@@ -29,6 +30,7 @@ char rfu; int cpuid_level; /* Maximum supported CPUID level, -1=no CPUID */ __u32 x86_capability; + __u32 mmu_cr4_features; char x86_vendor_id[16]; char x86_model_id[64]; int x86_cache_size; /* in KB - valid for CPUS which support this @@ -36,6 +38,7 @@ int fdiv_bug; int f00f_bug; int coma_bug; + int enable_fixups; unsigned long loops_per_sec; unsigned long *pgd_quick; unsigned long *pte_quick; @@ -70,16 +73,16 @@ #define X86_FEATURE_PGE 0x00002000 /* Page Global Enable */ #define X86_FEATURE_MCA 0x00004000 /* Machine Check Architecture */ #define X86_FEATURE_CMOV 0x00008000 /* CMOV instruction (FCMOVCC and FCOMI too if FPU present) */ -#define X86_FEATURE_PAT 0x00010000 /* Page Attribute Table */ +#define X86_FEATURE_PAT 0x00010000 /* Page Attribute Table */ #define X86_FEATURE_PSE36 0x00020000 /* 36-bit PSEs */ -#define X86_FEATURE_18 0x00040000 +#define X86_FEATURE_PN 0x00040000 /* 96 bit CPU serial # */ #define X86_FEATURE_19 0x00080000 #define X86_FEATURE_20 0x00100000 #define X86_FEATURE_21 0x00200000 #define X86_FEATURE_22 0x00400000 #define X86_FEATURE_MMX 0x00800000 /* multimedia extensions */ #define X86_FEATURE_FXSR 0x01000000 /* FXSAVE and FXRSTOR instructions (fast save and restore of FPU context), and CR4.OSFXSR (OS uses these instructions) available */ -#define X86_FEATURE_25 0x02000000 +#define X86_FEATURE_XMM 0x02000000 /* Intel MMX2 instruction set */ #define X86_FEATURE_26 0x04000000 #define X86_FEATURE_27 0x08000000 #define X86_FEATURE_28 0x10000000 @@ -89,6 +92,82 @@ extern struct cpuinfo_x86 boot_cpu_data; +#define X86_CR4_VME 0x0001 /* enable vm86 extensions */ +#define X86_CR4_PVI 0x0002 /* virtual interrupts flag enable */ +#define X86_CR4_TSD 0x0004 /* disable time stamp at ipl 3 */ +#define X86_CR4_DE 0x0008 /* enable debugging extensions */ +#define X86_CR4_PSE 0x0010 /* enable page size extensions */ +#define X86_CR4_PAE 0x0020 /* enable physical address extensions */ +#define X86_CR4_MCE 0x0040 /* Machine check enable 
*/ +#define X86_CR4_PGE 0x0080 /* enable global pages */ +#define X86_CR4_PCE 0x0100 /* enable performance counters at ipl 3 */ +#define X86_CR4_OSFXSR 0x0200 /* fast FPU save/restore */ +#define X86_CR4_OSXMMEXCPT 0x0400 /* KNI (MMX2) unmasked exception 19 */ + /* handler is available */ + +/* + * Some defines for use with the x86_fpustate variable in the new + * thread struct. We use these because the rest of the kernel doesn't + * like us messing with current->flags at arbitrary times ;-) + */ +#define X86_FPUSTATE_USER_SAVED 0x0001 +#define X86_FPUSTATE_KERN_ANY 0x0006 +#define X86_FPUSTATE_KERN_MMX 0x0002 +#define X86_FPUSTATE_KERN_KNI 0x0004 + +/* + * Save the cr4 feature set we're using (ie + * Pentium 4MB enable and PPro Global page + * enable), so that any CPUs that boot up + * after us can get the correct flags. + */ + +static inline void set_in_cr4(unsigned long mask) +{ + boot_cpu_data.mmu_cr4_features |= mask; + __asm__("movl %%cr4,%%eax\n\t" + "orl %0,%%eax\n\t" + "movl %%eax,%%cr4\n" + : : "irg" (mask) + :"ax"); +} + +extern int disable_x86_serial_nr; + +static inline void disable_serial_nr(void) +{ + if ( disable_x86_serial_nr && + (boot_cpu_data.x86_capability & X86_FEATURE_PN) ) { + printk("Disabling CPUID Serial number..."); + __asm__ __volatile__( "movl $0x119,%%ecx\n\t" + "rdmsr\n\t" + "orl $0x00200000,%%eax\n\t" + "wrmsr":::"ax","dx","cx","memory"); + /* + * We might need to re-read the x86 capability set now to + * make sure that the PN bit has been turned off so + * we know that the serial number stuff is disabled + * + * Note: we don't need to re-read the registers. We can tell + * by rebooting that the flag is off since on reboots that + * don't power the machine down the serial number doesn't + * get disabled any more because it already is disabled.
+ */ + printk("done.\n"); + } +} + +static inline void load_default_mxcsr(void) +{ + long mxcsr = 0x1f80; + + if ( (boot_cpu_data.mmu_cr4_features & X86_CR4_OSFXSR) && + (boot_cpu_data.x86_capability & X86_FEATURE_XMM) ) { + __asm__("ldmxcsr %0": :"m" (mxcsr)); + } +} + + #ifdef __SMP__ extern struct cpuinfo_x86 cpu_data[]; #define current_cpu_data cpu_data[smp_processor_id()] @@ -170,37 +249,62 @@ * Size of io_bitmap in longwords: 32 is ports 0-0x3ff. */ #define IO_BITMAP_SIZE 32 + +struct i387_hard_fsave { + long cwd; + long swd; + long twd; + long fip; + long fcs; + long foo; + long fos; + long st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */ +}; -struct i387_hard_struct { - long cwd; - long swd; - long twd; - long fip; - long fcs; - long foo; - long fos; - long st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */ - long status; /* software status information */ +/* + * has to be 128-bit aligned + */ +struct i387_hard_fxsave { + unsigned short fxcwd; + unsigned short fxswd; + unsigned short fxtwd; + unsigned short fxfopcode; + long fxfip; + short fxfcs; + short __reserved_00; + long fxfoo; + short fxfos; + short __reserved_01; + long mxcsr; + long __reserved_02; + long st_space[32]; /* 8*16 bytes for each FP/MMX-reg = 128 bytes */ + long xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */ + long __reserved_03 [14*4]; /* 14 16byte lines for remainder */ +} __attribute__ ((aligned (16))); + +union i387_hard_union { + struct i387_hard_fxsave fxsave; + struct i387_hard_fsave fsave; }; struct i387_soft_struct { - long cwd; - long swd; - long twd; - long fip; - long fcs; - long foo; - long fos; - long st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */ - unsigned char ftop, changed, lookahead, no_update, rm, alimit; - struct info *info; - unsigned long entry_eip; + long cwd; + long swd; + long twd; + long fip; + long fcs; + long foo; + long fos; + long st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */ + unsigned char ftop, changed, 
lookahead, no_update, rm, alimit; + struct info *info; + unsigned long entry_eip; }; union i387_union { - struct i387_hard_struct hard; + union i387_hard_union hard; struct i387_soft_struct soft; -}; +} __attribute__ ((aligned(16))); typedef struct { unsigned long seg; @@ -242,6 +346,10 @@ struct vm86_struct * vm86_info; unsigned long screen_bitmap; unsigned long v86flags, v86mask, v86mode, saved_esp0; + volatile long x86_fpustate; + char *mmx_reg_space; + char *kni_reg_space; + }; #define INIT_MMAP \ @@ -263,8 +371,9 @@ {~0, }, /* ioperm */ \ _TSS(0), 0, 0, 0, (mm_segment_t) { 0 }, /* obsolete */ \ { 0, }, \ - { { 0, }, }, /* 387 state */ \ + { { { 0, }, }, }, /* 387 state */ \ NULL, 0, 0, 0, 0, 0, /* vm86_info */ \ + 0, NULL, NULL /* fpustate, mmx, and xmm_reg_space */ \ } #define start_thread(regs, new_eip, new_esp) do { \ @@ -289,27 +398,6 @@ extern void copy_segments(int nr, struct task_struct *p, struct mm_struct * mm); extern void release_segments(struct mm_struct * mm); extern void forget_segments(void); - -/* - * FPU lazy state save handling.. - */ -#define save_fpu(tsk) do { \ - asm volatile("fnsave %0\n\tfwait":"=m" (tsk->tss.i387)); \ - tsk->flags &= ~PF_USEDFPU; \ - stts(); \ -} while (0) - -#define unlazy_fpu(tsk) do { \ - if (tsk->flags & PF_USEDFPU) \ - save_fpu(tsk); \ -} while (0) - -#define clear_fpu(tsk) do { \ - if (tsk->flags & PF_USEDFPU) { \ - tsk->flags &= ~PF_USEDFPU; \ - stts(); \ - } \ -} while (0) /* * Return saved PC of a blocked thread. Index: oldkernel/linux/include/asm-i386/string.h diff -u linux/include/asm-i386/string.h:1.1.1.1 linux/include/asm-i386/string.h:1.2 --- linux/include/asm-i386/string.h:1.1.1.1 Wed May 31 12:33:49 2000 +++ linux/include/asm-i386/string.h Thu Jun 1 15:05:19 2000 @@ -14,6 +14,10 @@ #include #else +#ifndef _LINUX_CONFIG_H +#include +#endif + /* * This string-include defines all string functions as inline * functions. Use gcc. 
It also assumes ds=es=data space, this should be @@ -293,10 +297,21 @@ } #define __HAVE_ARCH_MEMCPY +#ifdef CONFIG_X86_CPU_OPTIMIZATIONS +extern void * __kni_memcpy(void * to, const void * from, size_t n); +extern void * best_memcpy(void * to, const void * from, size_t n); +#define memcpy(t, f, n) \ +(__builtin_constant_p(n) ? \ + (((n) < 128) ? \ + __constant_memcpy((t),(f),(n)) : \ + best_memcpy((t),(f),(n))) : \ + best_memcpy((t),(f),(n))) +#else #define memcpy(t, f, n) \ (__builtin_constant_p(n) ? \ __constant_memcpy((t),(f),(n)) : \ __memcpy((t),(f),(n))) +#endif #define __HAVE_ARCH_MEMMOVE extern inline void * memmove(void * dest,const void * src, size_t n) @@ -449,21 +464,32 @@ #undef COMMON } -#define __constant_c_x_memset(s, c, count) \ -(__builtin_constant_p(count) ? \ - __constant_c_and_count_memset((s),(c),(count)) : \ - __constant_c_memset((s),(c),(count))) +#define __constant_x_count_memset(s, c, count) \ +(__builtin_constant_p(c) ? \ + __constant_c_and_count_memset((s),(0x01010101UL*(unsigned char)(c)),(count)) :\ + __constant_count_memset((s),(c),(count))) #define __memset(s, c, count) \ -(__builtin_constant_p(count) ? \ - __constant_count_memset((s),(c),(count)) : \ +(__builtin_constant_p(c) ? \ + __constant_c_memset((s),(0x01010101UL*(unsigned char)(c)),(count)) : \ __memset_generic((s),(c),(count))) #define __HAVE_ARCH_MEMSET +#ifdef CONFIG_X86_CPU_OPTIMIZATIONS +extern void * __kni_memset(void * s, char c, size_t count); +extern void * best_memset(void * s, char c, size_t count); #define memset(s, c, count) \ -(__builtin_constant_p(c) ? \ - __constant_c_x_memset((s),(0x01010101UL*(unsigned char)(c)),(count)) : \ +(__builtin_constant_p(count) ? \ + (((count) < 128) ? \ + __constant_x_count_memset((s),(c),(count)) : \ + best_memset((s),(c),(count))) : \ + best_memset((s),(c),(count))) +#else +#define memset(s, c, count) \ +(__builtin_constant_p(count) ? 
\ + __constant_x_count_memset((s),(c),(count)) : \ __memset((s),(c),(count))) +#endif /* * find the first occurrence of byte 'c', or 1 past the area if none Index: oldkernel/linux/include/asm-i386/uaccess.h diff -u linux/include/asm-i386/uaccess.h:1.1.1.1 linux/include/asm-i386/uaccess.h:1.2 --- linux/include/asm-i386/uaccess.h:1.1.1.1 Wed May 31 12:33:49 2000 +++ linux/include/asm-i386/uaccess.h Thu Jun 1 15:05:19 2000 @@ -571,20 +571,62 @@ return n; } +#ifdef CONFIG_X86_CPU_OPTIMIZATIONS + +/* + * The XMM based copy_*_user() function declarations...the best_*_user() + * routines need this + */ +unsigned long kni_copy_to_user(void *, const void *, unsigned long); +unsigned long kni_copy_from_user(void *, const void *, unsigned long); +unsigned long __kni_copy_to_user_nocheck(void *, const void *, unsigned long); +unsigned long __kni_copy_from_user_nocheck(void *, const void *, unsigned long); + +unsigned long best_copy_to_user(void *, const void *, unsigned long); +unsigned long best_copy_from_user(void *, const void *, unsigned long); +unsigned long __best_copy_to_user(void *, const void *, unsigned long); +unsigned long __best_copy_from_user(void *, const void *, unsigned long); + #define copy_to_user(to,from,n) \ (__builtin_constant_p(n) ? \ + (((n) < 128) ? \ __constant_copy_to_user((to),(from),(n)) : \ - __generic_copy_to_user((to),(from),(n))) + best_copy_to_user((to),(from),(n))) : \ + best_copy_to_user((to),(from),(n))) #define copy_from_user(to,from,n) \ (__builtin_constant_p(n) ? \ + (((n) < 128) ? \ __constant_copy_from_user((to),(from),(n)) : \ - __generic_copy_from_user((to),(from),(n))) + best_copy_from_user((to),(from),(n))) : \ + best_copy_from_user((to),(from),(n))) -#define copy_to_user_ret(to,from,n,retval) ({ if (copy_to_user(to,from,n)) return retval; }) +#define __copy_to_user(to,from,n) \ + (__builtin_constant_p(n) ? \ + (((n) < 128) ? 
\ + __constant_copy_to_user_nocheck((to),(from),(n)) : \ + __best_copy_to_user((to),(from),(n))) : \ + __best_copy_to_user((to),(from),(n))) -#define copy_from_user_ret(to,from,n,retval) ({ if (copy_from_user(to,from,n)) return retval; }) +#define __copy_from_user(to,from,n) \ + (__builtin_constant_p(n) ? \ + (((n) < 128) ? \ + __constant_copy_from_user_nocheck((to),(from),(n)) : \ + __best_copy_from_user((to),(from),(n))) : \ + __best_copy_from_user((to),(from),(n))) + +#else /* CONFIG_X86_CPU_OPTIMIZATIONS */ +#define copy_to_user(to,from,n) \ + (__builtin_constant_p(n) ? \ + __constant_copy_to_user((to),(from),(n)) : \ + __generic_copy_to_user((to),(from),(n))) + +#define copy_from_user(to,from,n) \ + (__builtin_constant_p(n) ? \ + __constant_copy_from_user((to),(from),(n)) : \ + __generic_copy_from_user((to),(from),(n))) + #define __copy_to_user(to,from,n) \ (__builtin_constant_p(n) ? \ __constant_copy_to_user_nocheck((to),(from),(n)) : \ @@ -594,6 +636,11 @@ (__builtin_constant_p(n) ? 
\ __constant_copy_from_user_nocheck((to),(from),(n)) : \ __generic_copy_from_user_nocheck((to),(from),(n))) +#endif + +#define copy_to_user_ret(to,from,n,retval) ({ if (copy_to_user(to,from,n)) return retval; }) + +#define copy_from_user_ret(to,from,n,retval) ({ if (copy_from_user(to,from,n)) return retval; }) long strncpy_from_user(char *dst, const char *src, long count); long __strncpy_from_user(char *dst, const char *src, long count); Index: oldkernel/linux/init/main.c diff -u linux/init/main.c:1.4 linux/init/main.c:1.5 --- linux/init/main.c:1.4 Thu Jun 1 15:01:35 2000 +++ linux/init/main.c Thu Jun 1 15:05:19 2000 @@ -103,6 +103,7 @@ #ifdef __i386__ extern void ioapic_pirq_setup(char *str, int *ints); extern void ioapic_setup(char *str, int *ints); +extern void x86_serial_nr_setup(char *str, int *ints); #endif extern void no_scroll(char *str, int *ints); extern void kbd_reset_setup(char *str, int *ints); @@ -644,6 +645,9 @@ { "noapic", ioapic_setup }, { "pirq=", ioapic_pirq_setup }, #endif +#endif +#ifdef __i386__ + { "x86_serial_nr", x86_serial_nr_setup }, #endif #ifdef CONFIG_BLK_DEV_RAM { "ramdisk_start=", ramdisk_start_setup },