/* head.S -- annotated copy.  Date: 2006-11-08.  Source: ooike */
/*
* linux/arch/i386/kernel/head.S -- the 32-bit startup code.
*
* Copyright (C) 1991, 1992 Linus Torvalds
*
* Enhanced CPU detection and feature setting code by Mike Jagdis
* and Martin Mares, November 1997.
*/
.text
#include <linux/config.h>
#include <linux/threads.h>
#include <linux/linkage.h>
#include <asm/segment.h>
#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/desc.h>
#include <asm/cache.h>
#include <asm/thread_info.h>
#include <asm/asm_offsets.h>
#include <asm/setup.h>
/*
* References to members of the new_cpu_data structure.
*/
#define X86 new_cpu_data+CPUINFO_x86
#define X86_VENDOR new_cpu_data+CPUINFO_x86_vendor
#define X86_MODEL new_cpu_data+CPUINFO_x86_model
#define X86_MASK new_cpu_data+CPUINFO_x86_mask
#define X86_HARD_MATH new_cpu_data+CPUINFO_hard_math
#define X86_CPUID new_cpu_data+CPUINFO_cpuid_level
#define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability
#define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id
/*
* This is how much memory *in addition to the memory covered up to
* and including _end* we need mapped initially. We need one bit for
* each possible page, but only in low memory, which means
* 2^32/4096/8 = 128K worst case (4G/4G split.)
*
* Modulo rounding, each megabyte assigned here requires a kilobyte of
* memory, which is currently unreclaimed.
*
* This should be a multiple of a page.
*/
#define INIT_MAP_BEYOND_END (128*1024)
/*
* 32-bit kernel entrypoint; only used by the boot CPU. On entry,
* %esi points to the real-mode code as a 32-bit pointer.
* CS and DS must be 4 GB flat segments, but we don't depend on
* any particular GDT layout, because we load our own as soon as we
* can.
*/
/* ENTRY is defined in include/linux/linkage.h: */
/*   #define __ALIGN .align 4,0x90 */
/*   #define ALIGN __ALIGN */
/*   #define ENTRY(name) \ */
/*     .globl name; \ */
/*     ALIGN; \ */
/*     name: */
/* .globl symbol  (makes the defined symbol visible to ld) */
/*
.global
Syntax: .global symbol
.global makes the symbol visible to ld. If you define symbol in your partial program, its value is made available to other partial programs that are linked with it. Otherwise, symbol takes its attributes from a symbol of the same name from another file linked into the same program.
Both spellings (.globl and .global) are accepted, for compatibility with other assemblers. .xdef is also accepted as a synonym for .global.
*/
/* .align (advances the "location counter" to the requested alignment)
.align
Syntax: .align alignment[, [fill][, max]]
Pad the location counter (in the current subsection) to a particular storage boundary. alignment (which must be absolute) is the alignment required, as described below.
fill (also absolute) gives the fill value to be stored in the padding bytes. It (and the comma) may be omitted. If it is omitted, the padding bytes are normally zero. However, on some systems, if the section is marked as containing code and the fill value is omitted, the space is filled with no-op instructions (I did not check whether this is the case in TIGCC).
max is also absolute, and is also optional. If it is present, it is the maximum number of bytes that should be skipped by this alignment directive. If doing the alignment would require skipping more bytes than the specified maximum, then the alignment is not done at all. You can omit the fill value (the second argument) entirely by simply using two commas after the required alignment; this can be useful if you want the alignment to be filled with no-op instructions when appropriate.
The way the required alignment is specified varies from system to system. For the a29k, hppa, m68k, m88k, w65, sparc, Xtensa, and Renesas / SuperH SH, and i386 using ELF format, the first expression is the alignment request in bytes. For example .align 8 advances the location counter until it is a multiple of 8. If the location counter is already a multiple of 8, no change is needed.
For other systems, including the i386 using a.out format, and the arm and strongarm, it is the number of low-order zero bits the location counter must have after advancement. For example .align 3 advances the location counter until it is a multiple of 8. If the location counter is already a multiple of 8, no change is needed.
This inconsistency is due to the different behaviors of the various native assemblers for these systems which as must emulate. as also provides .balign and .p2align directives, which have a consistent behavior across all architectures (but are specific to as).
*/
ENTRY(startup_32)
/*
 * Boot-CPU entry.  Paging is still off while this block runs, so every
 * symbol is converted from its link-time address to a physical one by
 * subtracting __PAGE_OFFSET.
 *
 * Register use in this block:
 *   %eax  scratch, then the page-table entry value being written
 *   %ecx  scratch / loop counter
 *   %edx  cursor into swapper_pg_dir
 *   %edi  cursor into the page tables that start at pg0
 *   %ebp  end-of-mapping threshold
 *   %esi  must NOT be touched (it is consumed later as the pointer to
 *         the real-mode data); the stack must not be used either
 */

/*
 * Set segments to known values.
 */
	cld
	lgdt boot_gdt_descr - __PAGE_OFFSET
	movl $(__BOOT_DS),%eax
	movl %eax,%ds
	movl %eax,%es
	movl %eax,%fs
	movl %eax,%gs

/*
 * Clear BSS first so that there are no surprises...
 * No need to cld as DF is already clear from cld above...
 */
	xorl %eax,%eax
	movl $__bss_start - __PAGE_OFFSET,%edi
	movl $__bss_stop - __PAGE_OFFSET,%ecx
	subl %edi,%ecx			/* %ecx = BSS length in bytes */
	shrl $2,%ecx			/* bytes -> dwords: stosl stores 4 bytes per step */
	rep ; stosl

/*
 * Initialize page tables. This creates a PDE and a set of page
 * tables, which are located immediately beyond _end. The variable
 * init_pg_tables_end is set up to point to the first "safe" location.
 * Mappings are created both at virtual address 0 (identity mapping)
 * and PAGE_OFFSET for up to _end+sizeof(page tables)+INIT_MAP_BEYOND_END.
 *
 * Warning: don't use %esi or the stack in this code. However, %esp
 * can be used as a GPR if you really need it...
 */
/*
 * Annotator's notes (translated and corrected):
 *  - The CPU is in protected mode here but paging is not yet enabled.
 *  - pg0 and swapper_pg_dir are referenced by physical address
 *    (symbol - __PAGE_OFFSET) and used as store targets.
 *  - The annotator's worked example assumes __PAGE_OFFSET = 0xC0000000,
 *    swapper_pg_dir linked at 0xC0101000 and pg0 at 0xC0102000.  Those
 *    values are not established by this file -- TODO confirm against
 *    vmlinux.lds.
 *  - Directory and table entries are 4 bytes each: %edx advances by 4
 *    per directory entry and stosl stores one dword per table entry.
 */

/*
 * Byte offset, inside the page directory, of the entry that maps
 * __PAGE_OFFSET: (__PAGE_OFFSET >> 22) * 4 == __PAGE_OFFSET >> 20.
 * For __PAGE_OFFSET = 0xC0000000 this is 0xC00.
 */
page_pde_offset = (__PAGE_OFFSET >> 20);

	movl $(pg0 - __PAGE_OFFSET), %edi		/* %edi = physical address of first page table */
	movl $(swapper_pg_dir - __PAGE_OFFSET), %edx	/* %edx = physical address of page directory */
	movl $0x007, %eax				/* 0x007 = PRESENT+RW+USER; maps physical page 0 first */

10:
	/* Create PDE entry: physical address of the table about to be filled, plus attribute bits */
	leal 0x007(%edi),%ecx
	movl %ecx,(%edx)			/* Store identity PDE entry */
	movl %ecx,page_pde_offset(%edx)		/* Store kernel PDE entry (same table, seen at PAGE_OFFSET) */
	addl $4,%edx				/* next directory slot */
	movl $1024, %ecx			/* 1024 entries per page table */
11:
	/*
	 * Fill one page table.  Each pass stores %eax at (%edi), advances
	 * %edi by 4, and moves %eax on to the next 4 KB physical page, so
	 * one full table (1024 entries, 4 KB) maps 4 MB of physical memory.
	 * In the annotator's example layout the first table reads:
	 *   0x00102000: 0x00000007
	 *   0x00102004: 0x00001007
	 *   ...
	 *   0x00102FFC: 0x003FF007
	 */
	stosl
	addl $0x1000,%eax
	loop 11b

	/* End condition: we must map up to and including INIT_MAP_BEYOND_END */
	/* bytes beyond the end of our own page tables; the +0x007 is the attribute bits */
	/*
	 * %edi now points just past the tables written so far and %eax
	 * holds the next physical address to map (with attribute bits).
	 * Keep adding tables while the mapped range is still short of
	 * %edi + INIT_MAP_BEYOND_END (unsigned compare).
	 */
	leal (INIT_MAP_BEYOND_END+0x007)(%edi),%ebp
	cmpl %ebp,%eax
	jb 10b
	movl %edi,(init_pg_tables_end - __PAGE_OFFSET)	/* record first free location after the tables */
#ifdef CONFIG_SMP
	/*
	 * The boot CPU falls into this point from the code above: mark it
	 * with %ebx = 0 and skip the secondary-CPU entry sequence.
	 */
	xorl %ebx,%ebx				/* This is the boot CPU (BSP) */
	jmp 3f

/*
 * Non-boot CPU entry point; entered from trampoline.S
 * We can't lgdt here, because lgdt itself uses a data segment, but
 * we know the trampoline has already loaded the boot_gdt_table GDT
 * for us.
 */
ENTRY(startup_32_smp)
	cld
	movl $(__BOOT_DS),%eax
	movl %eax,%ds
	movl %eax,%es
	movl %eax,%fs
	movl %eax,%gs

/*
 * New page tables may be in 4Mbyte page mode and may
 * be using the global pages.
 *
 * NOTE! If we are on a 486 we may have no cr4 at all!
 * So we do not try to touch it unless we really have
 * some bits in it to set. This won't work if the BSP
 * implements cr4 but this AP does not -- very unlikely
 * but be warned! The same applies to the pse feature
 * if not equally supported. --macro
 *
 * NOTE! We have to correct for the fact that we're
 * not yet offset PAGE_OFFSET..
 */
#define cr4_bits mmu_cr4_features-__PAGE_OFFSET
	movl cr4_bits,%edx
	andl %edx,%edx
	jz 6f				/* nothing to set: leave %cr4 untouched */
	movl %cr4,%eax			# Turn on paging options (PSE,PAE,..)
	orl %edx,%eax
	movl %eax,%cr4

	btl $5, %eax			# check if PAE is enabled (CR4 bit 5)
	jnc 6f

	/* Check if extended functions are implemented */
	movl $0x80000000, %eax
	cpuid
	cmpl $0x80000000, %eax		/* %eax = highest extended leaf supported */
	jbe 6f
	mov $0x80000001, %eax
	cpuid
	/* Execute Disable bit supported? */
	btl $20, %edx			/* CPUID.80000001h:EDX bit 20 = NX */
	jnc 6f

	/* Setup EFER (Extended Feature Enable Register) */
	movl $0xc0000080, %ecx		/* MSR index of EFER */
	rdmsr

	btsl $11, %eax			/* EFER bit 11 = NXE */
	/* Make changes effective */
	wrmsr

6:
	/* This is a secondary processor (AP) */
	xorl %ebx,%ebx
	incl %ebx			/* %ebx = 1 marks an AP; tested after paging is enabled */

3:
#endif /* CONFIG_SMP */
/*
 * Enable paging
 *
 * Load %cr3 with the physical address of swapper_pg_dir, then set the
 * PG bit (bit 31) in %cr0.  From the instruction after the %cr0 write
 * onwards the CPU translates addresses through the page tables.
 */
	movl $swapper_pg_dir-__PAGE_OFFSET,%eax
	movl %eax,%cr3			/* set the page table pointer.. */
	movl %cr0,%eax
	orl $0x80000000,%eax		/* immediate operand: PG is bit 31 */
	movl %eax,%cr0			/* ..and set paging (PG) bit */
	ljmp $__BOOT_CS,$1f		/* Clear prefetch and normalize %eip */
1:
	/*
	 * Set up the stack pointer.  lss loads %esp and %ss in one step
	 * from the two .long values stored at stack_start.
	 */
	lss stack_start,%esp

/*
 * Initialize eflags. Some BIOS's leave bits like NT set. This would
 * confuse the debugger if this code is traced.
 * XXX - best to initialize before switching to protected mode.
 */
	pushl $0
	popfl

#ifdef CONFIG_SMP
	/* %ebx is 0 on the boot CPU and 1 on a secondary CPU (set before paging was enabled). */
	andl %ebx,%ebx
	jz 1f				/* Initial CPU cleans BSS */
	jmp checkCPUtype
1:
#endif /* CONFIG_SMP */
/*
 * start system 32-bit setup. We need to re-do some of the things done
 * in 16-bit mode for the "real" operations.
 */
	call setup_idt			/* fill every IDT slot with the default gate */

/*
 * Copy bootup parameters out of the way.
 * Note: %esi still has the pointer to the real-mode data.
 * The block is copied as PARAM_SIZE/4 dwords into boot_params.
 */
	cld
	movl $(PARAM_SIZE/4),%ecx
	movl $boot_params,%edi
	rep ; movsl

	/*
	 * Locate the command line.  A non-zero pointer at
	 * boot_params+NEW_CL_POINTER selects the new protocol; otherwise
	 * the old protocol is used only when the magic word matches, and
	 * no command line is copied at all when it does not.
	 */
	movl boot_params+NEW_CL_POINTER,%esi
	testl %esi,%esi
	jnz 2f				# New command line protocol
	cmpw $(OLD_CL_MAGIC),OLD_CL_MAGIC_ADDR
	jne 1f
	movzwl OLD_CL_OFFSET,%esi
	addl $(OLD_CL_BASE_ADDR),%esi
2:
	/* Copy COMMAND_LINE_SIZE/4 dwords from (%esi) into saved_command_line. */
	movl $(COMMAND_LINE_SIZE/4),%ecx
	movl $saved_command_line,%edi
	rep ; movsl
1:
checkCPUtype:
/*
 * Identify the CPU (386 / 486 / CPUID-capable), record the result in
 * new_cpu_data through the X86_* macros, program the matching %cr0
 * bits, probe for an FPU, then switch to the final GDT/IDT and hand
 * over to C code.  Reached by fall-through on the boot CPU and by a
 * direct jump on secondary CPUs.
 */

	movl $-1,X86_CPUID		#  -1 for no CPUID initially

/* check if it is 486 or 386. */
/*
 * XXX - this does a lot of unnecessary setup. Alignment checks don't
 * apply at our cpl of 0 and the stack ought to be aligned already, and
 * we don't need to preserve eflags.
 */

	movb $3,X86			# at least 386
	pushfl				# push EFLAGS
	popl %eax			# get EFLAGS
	movl %eax,%ecx			# save original EFLAGS
	xorl $0x240000,%eax		# flip AC and ID bits in EFLAGS
	pushl %eax			# copy to EFLAGS
	popfl				# set EFLAGS
	pushfl				# get new EFLAGS
	popl %eax			# put it in eax
	xorl %ecx,%eax			# change in flags
	pushl %ecx			# restore original EFLAGS
	popfl
	testl $0x40000,%eax		# check if AC bit changed
	je is386

	movb $4,X86			# at least 486
	testl $0x200000,%eax		# check if ID bit changed
	je is486

	/* get vendor info */
	xorl %eax,%eax			# call CPUID with 0 -> return vendor ID
	cpuid
	movl %eax,X86_CPUID		# save CPUID level
	movl %ebx,X86_VENDOR_ID		# lo 4 chars
	movl %edx,X86_VENDOR_ID+4	# next 4 chars
	movl %ecx,X86_VENDOR_ID+8	# last 4 chars

	orl %eax,%eax			# do we have processor info as well?
	je is486

	movl $1,%eax			# Use the CPUID instruction to get CPU type
	cpuid
	movb %al,%cl			# save reg for future use
	andb $0x0f,%ah			# mask processor family
	movb %ah,X86
	andb $0xf0,%al			# mask model
	shrb $4,%al
	movb %al,X86_MODEL
	andb $0x0f,%cl			# mask mask revision
	movb %cl,X86_MASK
	movl %edx,X86_CAPABILITY

is486:	movl $0x50022,%ecx		# set AM, WP, NE and MP
	jmp 2f

is386:	movl $2,%ecx			# set MP
2:	movl %cr0,%eax
	andl $0x80000011,%eax		# Save PG,PE,ET
	orl %ecx,%eax
	movl %eax,%cr0

	call check_x87
	incb ready			/* count CPUs that got this far: 1 on the first CPU */
	lgdt cpu_gdt_descr
	lidt idt_descr
	ljmp $(__KERNEL_CS),$1f		/* far jump reloads %cs from the new GDT */
1:	movl $(__KERNEL_DS),%eax	# reload all the segment registers
	movl %eax,%ss			# after changing gdt.

	movl $(__USER_DS),%eax		# DS/ES contains default USER segment
	movl %eax,%ds
	movl %eax,%es

	xorl %eax,%eax			# Clear FS/GS and LDT
	movl %eax,%fs
	movl %eax,%gs
	lldt %ax
	cld				# gcc2 wants the direction flag cleared at all times
#ifdef CONFIG_SMP
	movb ready, %cl
	cmpb $1,%cl
	je 1f				# the first CPU calls start_kernel
					# all other CPUs call initialize_secondary
	call initialize_secondary
	jmp L6
1:
#endif /* CONFIG_SMP */
	/* Hand over to the second stage of initialisation (annotator: start_kernel is defined in init/main.c). */
	call start_kernel
L6:
	jmp L6				# main should never return here, but
					# just in case, we know what happens.
/*
 * We depend on ET to be correct. This checks for 287/387.
 *
 * check_x87: probe for a math coprocessor.
 *   Out:    X86_HARD_MATH = 1 if the status word reads back with a
 *           zero low byte after fninit, else 0; when no coprocessor
 *           answers, the EM bit of %cr0 is toggled on as well.
 *   Clobb:  %eax, flags, %cr0 (TS cleared by clts; EM possibly set)
 */
check_x87:
	movb $0,X86_HARD_MATH		/* assume no FPU until proven otherwise */
	clts
	fninit
	fstsw %ax
	cmpb $0,%al			/* a present FPU reports a zero low status byte */
	je 1f
	movl %cr0,%eax			/* no coprocessor: have to set bits */
	xorl $4,%eax			/* set EM (bit 2; the caller cleared it when it masked %cr0) */
	movl %eax,%cr0
	ret
	ALIGN
1:	movb $1,X86_HARD_MATH
	.byte 0xDB,0xE4			/* fsetpm for 287, ignored by 387 */
	ret
/*
 * setup_idt
 *
 * sets up a idt with 256 entries pointing to
 * ignore_int, interrupt gates. It doesn't actually load
 * idt - that can be done only after paging has been enabled
 * and the kernel moved to PAGE_OFFSET. Interrupts
 * are enabled elsewhere, when we can be relatively
 * sure everything is ok.
 *
 * Warning: %esi is live across this function.
 *
 * Every one of the 256 descriptors gets the same default handler,
 * ignore_int.
 *   Clobb:  %eax, %ecx, %edx, %edi, flags
 *
 * Gate layout built here (8 bytes per entry):
 *   low dword  (%eax) = __KERNEL_CS << 16 | handler offset bits 15..0
 *   high dword (%edx) = handler offset bits 31..16 << 16 | 0x8E00
 */
setup_idt:
	lea ignore_int,%edx
	movl $(__KERNEL_CS << 16),%eax
	movw %dx,%ax			/* selector = 0x0010 = cs */
	movw $0x8E00,%dx		/* interrupt gate - dpl=0, present */

	lea idt_table,%edi
	mov $256,%ecx			/* number of gates to write */
rp_sidt:
	movl %eax,(%edi)
	movl %edx,4(%edi)
	addl $8,%edi			/* advance to the next 8-byte gate */
	dec %ecx
	jne rp_sidt
	ret
/* This is the default interrupt "handler" :-) */
/*
 * Default handler installed in every IDT slot by setup_idt.  When
 * CONFIG_PRINTK is set it prints int_msg through printk; otherwise it
 * just returns from the interrupt.  All registers it touches are saved
 * and restored.
 */
ALIGN
ignore_int:
cld
#ifdef CONFIG_PRINTK
/* Save the registers used below: 5 pushes = 20 bytes. */
pushl %eax
pushl %ecx
pushl %edx
pushl %es
pushl %ds
movl $(__KERNEL_DS),%eax
movl %eax,%ds
movl %eax,%es
/*
 * Re-push four dwords as printk arguments.  Each pushl lowers %esp by
 * 4, so these operands address the slots at offsets 16, 20, 24 and 28
 * from the %esp value right after the five saves above: the saved %eax
 * followed by the three dwords beneath it.
 * NOTE(review): those three are presumably EIP, CS and EFLAGS pushed
 * by the CPU when no error code is present -- confirm.
 */
pushl 16(%esp)
pushl 24(%esp)
pushl 32(%esp)
pushl 40(%esp)
pushl $int_msg
call printk
addl $(5*4),%esp	/* drop the five printk arguments */
popl %ds
popl %es
popl %edx
popl %ecx
popl %eax
#endif
iret
/*
 * Real beginning of normal "text" segment
 *
 * Both names are exported at the same location; no code is emitted
 * for them here.
 */
ENTRY(stext)
ENTRY(_stext)
/*
 * BSS section
 */
/*
 * Annotator's note (translated): this page-aligned BSS area holds the
 * initial page directory.  According to the annotation, once the image
 * is loaded swapper_pg_dir sits at physical 0x00101000 and
 * empty_zero_page at physical 0x00104000.  Those addresses come from
 * the annotation, not from this file -- TODO confirm against the
 * linker script.  (The linear addresses quoted in the original note
 * were self-inconsistent and are not repeated here.)
 */
.section ".bss.page_aligned","w"
ENTRY(swapper_pg_dir)
	.fill 1024,4,0		/* 1024 four-byte entries, zero-filled (4096 bytes) */
ENTRY(empty_zero_page)
	.fill 4096,1,0		/* 4096 zero bytes */
/*
 * This starts the data section.
 */
/*
 * Annotator's note (translated): the GAS manual documents .long as a
 * synonym of .int:
 *   .long expressions
 *   .long is the same as '.int', see Section 7.37 [.int], page 42.
 * and describes .int as follows:
 *   Expect zero or more expressions, of any section, separated by commas.
 *   For each expression, emit a number that, at run time, is the value
 *   of that expression. The byte order and bit size of the number depends
 *   on what kind of target the assembly is for.
 * In other words, each .long below stores the value of its expression
 * at the location named by the preceding label.
 */
.data
ENTRY(stack_start)
/*
 * Two dwords: the initial stack pointer followed by the stack segment
 * selector.  The stack pointer starts at init_thread_union+THREAD_SIZE.
 * Annotator: THREAD_SIZE is 8K and init_thread_union lives in the
 * .data.init_thread section -- neither is visible in this file, confirm.
 */
.long init_thread_union+THREAD_SIZE
/* Annotator: __BOOT_DS = (GDT_ENTRY_BOOT_CS + 1) * 8 = 3*8 -- assumes GDT_ENTRY_BOOT_CS is 2, confirm. */
.long __BOOT_DS
/* One-byte counter, initially zero. */
ready: .byte 0
/* Format string for the "unknown interrupt" message. */
int_msg:
.asciz "Unknown interrupt or fault at EIP %p %p %p\n"
/*
 * The IDT and GDT 'descriptors' are a strange 48-bit object
 * only used by the lidt and lgdt instructions. They are not
 * like usual segment descriptors - they consist of a 16-bit
 * segment size, and 32-bit linear address value:
 */
.globl boot_gdt_descr
.globl idt_descr
.globl cpu_gdt_descr
ALIGN
# early boot GDT descriptor (must use 1:1 address mapping)
.word 0 # 32 bit align gdt_desc.address
boot_gdt_descr:
.word __BOOT_DS+7
.long boot_gdt_table - __PAGE_OFFSET
.word 0 # 32-bit align idt_desc.address
idt_descr:
.word IDT_ENTRIES*8-1 # idt contains 256 entries
.long idt_table
# boot GDT descriptor (later on used by CPU#0):
.word 0 # 32 bit align gdt_desc.address
cpu_gdt_descr:
.word GDT_ENTRIES*8-1
.long cpu_gdt_table
.fill NR_CPUS-1,8,0 # space for the other GDT descriptors
/*
 * The boot_gdt_table must mirror the equivalent in setup.S and is
 * used only for booting.
 *
 * Layout: GDT_ENTRY_BOOT_CS zeroed 8-byte slots, then one code and one
 * data descriptor.
 */
.align L1_CACHE_BYTES
ENTRY(boot_gdt_table)
.fill GDT_ENTRY_BOOT_CS,8,0
.quad 0x00cf9a000000ffff /* kernel 4GB code at 0x00000000 */
.quad 0x00cf92000000ffff /* kernel 4GB data at 0x00000000 */
/*
 * The Global Descriptor Table, per-CPU.  The table below emits 32
 * quadword descriptors (an earlier version of this comment said 28).
 * The hex value in each comment is the selector for that slot as
 * written by the original authors.
 */
.align PAGE_SIZE_asm
ENTRY(cpu_gdt_table)
.quad 0x0000000000000000 /* NULL descriptor */
.quad 0x0000000000000000 /* 0x0b reserved */
.quad 0x0000000000000000 /* 0x13 reserved */
.quad 0x0000000000000000 /* 0x1b reserved */
.quad 0x0000000000000000 /* 0x20 unused */
.quad 0x0000000000000000 /* 0x28 unused */
.quad 0x0000000000000000 /* 0x33 TLS entry 1 */
.quad 0x0000000000000000 /* 0x3b TLS entry 2 */
.quad 0x0000000000000000 /* 0x43 TLS entry 3 */
.quad 0x0000000000000000 /* 0x4b reserved */
.quad 0x0000000000000000 /* 0x53 reserved */
.quad 0x0000000000000000 /* 0x5b reserved */
.quad 0x00cf9a000000ffff /* 0x60 kernel 4GB code at 0x00000000 */
.quad 0x00cf92000000ffff /* 0x68 kernel 4GB data at 0x00000000 */
.quad 0x00cffa000000ffff /* 0x73 user 4GB code at 0x00000000 */
.quad 0x00cff2000000ffff /* 0x7b user 4GB data at 0x00000000 */
.quad 0x0000000000000000 /* 0x80 TSS descriptor */
.quad 0x0000000000000000 /* 0x88 LDT descriptor */
/* Segments used for calling PnP BIOS */
.quad 0x00c09a0000000000 /* 0x90 32-bit code */
.quad 0x00809a0000000000 /* 0x98 16-bit code */
.quad 0x0080920000000000 /* 0xa0 16-bit data */
.quad 0x0080920000000000 /* 0xa8 16-bit data */
.quad 0x0080920000000000 /* 0xb0 16-bit data */
/*
 * The APM segments have byte granularity and their bases
 * and limits are set at run time.
 */
.quad 0x00409a0000000000 /* 0xb8 APM CS code */
.quad 0x00009a0000000000 /* 0xc0 APM CS 16 code (16 bit) */
.quad 0x0040920000000000 /* 0xc8 APM DS data */
.quad 0x0000920000000000 /* 0xd0 - ESPFIX 16-bit SS */
.quad 0x0000000000000000 /* 0xd8 - unused */
.quad 0x0000000000000000 /* 0xe0 - unused */
.quad 0x0000000000000000 /* 0xe8 - unused */
.quad 0x0000000000000000 /* 0xf0 - unused */
.quad 0x0000000000000000 /* 0xf8 - GDT entry 31: double-fault TSS */