From: zzt (study hard and make progress every day), Board: Linux
Subject: [Forwarded] Notes on reading the Linux kernel
Site: BBS 荔园晨风站 (Mon Jan 24 17:21:11 2000), forwarded

[ The following text is forwarded from zzt's mailbox ]
[ Originally posted by zzt.bbs@bbs.zd.dhs.org ]

Author: fuse (保险丝)  Board: Linux
Subject: [Forwarded] Notes on reading the Linux kernel
Date: Tue Dec 28 18:43:22 1999

Reposted from: http://bricks.net.dlut.edu.cn/linux/kernel/linux_kernel.txt
Bricks Group  http://bricks.yeah.net/
=======================================
What follows are a few notes I made while reading the kernel. Since I am
only getting started myself, treat everything below as reference only.
The kernel version read is 2.2.10.
The main reference is TLK (The Linux Kernel).
I. Memory allocation in Linux
-----------------------------

1. The page-level allocator
---------------------------

Linux uses the buddy algorithm to allocate and free blocks of memory
pages efficiently. The page allocation code hands out blocks made up of
one or more physical pages, always in power-of-two sizes.
struct free_area_struct {
    struct page *next;
    struct page *prev;
    unsigned int * map;
};

static struct free_area_struct free_area[NR_MEM_LISTS];

typedef struct page {
    /* these must be first (free area handling) */
    struct page *next;
    struct page *prev;
    struct inode *inode;
    unsigned long offset;
    struct page *next_hash;
    atomic_t count;
    /* atomic flags, some possibly updated asynchronously */
    unsigned long flags;
    struct wait_queue *wait;
    struct page **pprev_hash;
    struct buffer_head * buffers;
} mem_map_t;
Each free_area[i] entry anchors a doubly linked list of free blocks of
2^i pages each, plus a bitmap:

 free_area
   -----
   | 5 |
   -----
   | 4 | --> mem_map_t <--> mem_map_t  (doubly linked list of free
   |   | --> map (bitmap)               blocks of 2^4 pages each)
   -----
   | 3 |
   -----
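To make the power-of-two blocks concrete, here is a minimal sketch
against the 2.2-era interface __get_free_pages/free_pages (the demo
function itself is invented). The second argument is the order, i.e.
log2 of the number of pages requested:

#include <linux/mm.h>
#include <linux/errno.h>

/* Ask the buddy allocator for an order-2 block: 2^2 = 4 contiguous
 * physical pages. */
static int demo_order2_alloc(void)
{
    unsigned long addr = __get_free_pages(GFP_KERNEL, 2);

    if (!addr)
        return -ENOMEM;     /* no order-2 block could be made free */

    /* ... use the 4 contiguous pages at 'addr' ... */

    free_pages(addr, 2);    /* return the block; the allocator may now
                               merge it with its buddy again */
    return 0;
}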
2. The kernel memory allocator (KMA)
------------------------------------

The KMA services dynamic memory allocation requests. It uses the slab
allocation scheme (first adopted in Solaris). Memory is allocated and
freed with kmalloc and kfree, whose prototypes are:

void * kmalloc(size_t size, int flags);
void kfree(const void *objp);

The flags argument of kmalloc can take the following values:
#define __GFP_WAIT 0x01
#define __GFP_LOW 0x02
#define __GFP_MED 0x04
#define __GFP_HIGH 0x08
#define __GFP_IO 0x10
#define __GFP_SWAP 0x20
#define __GFP_DMA 0x80
#define GFP_BUFFER (__GFP_LOW | __GFP_WAIT)
#define GFP_ATOMIC (__GFP_HIGH)
#define GFP_USER (__GFP_LOW | __GFP_WAIT | __GFP_IO)
#define GFP_KERNEL (__GFP_MED | __GFP_WAIT | __GFP_IO)
#define GFP_NFS (__GFP_HIGH | __GFP_WAIT | __GFP_IO)
#define GFP_KSWAPD (__GFP_IO | __GFP_SWAP)
/* Flag - indicates that the buffer will be suitable for DMA.
   Ignored on some platforms, used as appropriate on others */
#define GFP_DMA __GFP_DMA
The KMA obtains its memory pages from the page-level allocator, and it
can also hand pages back to it.
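A quick usage sketch (the buffer size and function name are invented; in
2.2 the declarations live in <linux/slab.h>, also reachable through
<linux/malloc.h>):

#include <linux/slab.h>
#include <linux/string.h>

/* Illustrative only: allocate a small buffer from process context. */
static char *make_buffer(void)
{
    /* GFP_KERNEL may sleep while memory is reclaimed, so it is only
     * safe in process context; an interrupt handler would pass
     * GFP_ATOMIC instead and must cope with failure. */
    char *buf = kmalloc(1024, GFP_KERNEL);

    if (buf)
        memset(buf, 0, 1024);
    return buf;     /* the caller eventually releases it with kfree() */
}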
II. Process scheduling
----------------------

The scheduling-related fields of struct task_struct are:
volatile long state;    /* -1 unrunnable, 0 runnable, >0 stopped */
unsigned long flags;    /* per process flags, defined below */
long need_resched;
long counter;           /* time the process may still run, in clock
                           ticks; starts out equal to priority */
long priority;          /* the process's priority, which is also the
                           time slice it is granted (in jiffies) */
cycles_t avg_slice;
/* SMP and runqueue state */
int has_cpu;
int processor;
int last_processor;
int lock_depth;
unsigned long policy;   /* scheduling policy: normal or real-time;
                           real-time comes in two flavors, round robin
                           and first-in first-out:
                           SCHED_OTHER  SCHED_RR  SCHED_FIFO */
unsigned long rt_priority;  /* relative priority of a real-time process */
unsigned long it_real_value, it_prof_value, it_virt_value;
unsigned long it_real_incr, it_prof_incr, it_virt_incr;
struct timer_list real_timer;
struct tms times;
do_timer is called on every clock tick.

// defined in sched.c
void do_timer(struct pt_regs * regs)
{
    // increment jiffies
    (*(unsigned long *)&jiffies)++;
    lost_ticks++;
    // mark the timer bottom half for execution
    mark_bh(TIMER_BH);
    if (!user_mode(regs))
        lost_ticks_system++;
    // if callout functions are queued, mark their bottom half too
    if (tq_timer)
        mark_bh(TQUEUE_BH);
}
// callout function structure
struct tq_struct {
    struct tq_struct *next;
    unsigned long sync;
    void (* routine)(void *);
    void *data;
};

typedef struct tq_struct *task_queue;
task_queue tq_timer, tq_immediate, tq_scheduler, tq_disk;
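The usual way a driver uses these queues, as a hedged sketch (my_routine,
my_task and defer_work are invented names): fill in a tq_struct, queue
it, and mark the matching bottom half.

#include <linux/tqueue.h>
#include <linux/interrupt.h>

static void my_routine(void *data)
{
    /* the deferred work runs here, later, in bottom-half context */
}

/* next = NULL and sync = 0 mark the entry as not currently queued */
static struct tq_struct my_task = { NULL, 0, my_routine, NULL };

static void defer_work(void)
{
    queue_task(&my_task, &tq_immediate);
    mark_bh(IMMEDIATE_BH);  /* tq_timer would not need this step:
                               do_timer marks TQUEUE_BH by itself */
}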
// the timer bottom half
static void timer_bh(void)
{
    update_times();
    run_old_timers();
    run_timer_list();
}
update_times refreshes the current process's counter and also processes
the current process's

unsigned long it_real_value, it_prof_value, it_virt_value;
unsigned long it_real_incr, it_prof_incr, it_virt_incr;
struct timer_list real_timer;
struct tms times;

While handling struct tms times, SIGXCPU and SIGKILL may be sent.
While handling it_virt_value and it_prof_value, SIGVTALRM and SIGPROF
may be sent.
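run_timer_list, called from timer_bh above, drives the dynamic kernel
timers (the same machinery real_timer uses). A sketch of arming one,
with invented names my_timer/my_timeout/arm_one_shot:

#include <linux/timer.h>
#include <linux/sched.h>    /* jiffies, HZ */

static void my_timeout(unsigned long data)
{
    /* called by run_timer_list() from the timer bottom half */
}

static struct timer_list my_timer;

static void arm_one_shot(void)
{
    init_timer(&my_timer);
    my_timer.function = my_timeout;
    my_timer.data     = 0;              /* argument for my_timeout */
    my_timer.expires  = jiffies + HZ;   /* fire roughly one second out */
    add_timer(&my_timer);
}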
When a system call completes, or when a ``slow'' interrupt completes,
ret_from_sys_call() is called. It does a bit of work, but all we care
about are two lines:
cmpl $0, need_resched(%ebx)
jne reschedule
reschedule:
call SYMBOL_NAME(schedule) # test
jmp ret_from_sys_call
Now let us see how schedule() picks the next process to run.

First look at the goodness function. It returns a process's 'weight',
which indicates how much the process deserves to run next (bigger is
better). For a normal process it essentially returns the remaining
counter; for a real-time process it returns 1000 + rt_priority, so a
real-time process always outweighs a normal one. The scheduler itself
never changes a process's priority or rt_priority.

When every runnable process has exhausted its counter, the counters are
recalculated very simply:

counter = counter/2 + priority;
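In code, the rule just described looks roughly like this (a simplified
sketch, not the verbatim kernel function; the real 2.2 goodness() also
adds p->priority for a normal process with ticks left, plus small
bonuses for staying on the same CPU and sharing the current mm):

/* simplified weight calculation */
static int goodness_sketch(struct task_struct *p)
{
    if (p->policy != SCHED_OTHER)
        return 1000 + p->rt_priority;   /* real-time always wins */
    return p->counter;                  /* remaining slice, in ticks */
}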
struct task_struct init_task;

The whole runqueue is a circular doubly linked list of struct
task_struct, and init_task plays a central role in it. Right after
initialization only init_task exists, forming a closed ring linked to
itself:
init_task
-----------------------------
| prev_run = &init_task; |
| next_run = &init_task; |
-----------------------------
After another process p1 is added:

  init_task                      p1
 --------------------------    --------------------------
 | prev_run = &p1;        |    | prev_run = &init_task; |
 | next_run = &p1;        |    | next_run = &init_task; |
 --------------------------    --------------------------

After another process p2 is added (new processes go in at the front):

  init_task                      p1
 --------------------------    --------------------------
 | prev_run = &p1;        |    | prev_run = &p2;        |
 | next_run = &p2;        |    | next_run = &init_task; |
 --------------------------    --------------------------

  p2
 --------------------------
 | prev_run = &init_task; |
 | next_run = &p1;        |
 --------------------------
Four functions manipulate this task_struct list:

add_to_runqueue      inserts a process right after init_task.
del_from_runqueue    unlinks the given process from the list.
move_last_runqueue
move_first_runqueue  these two move the given process to just before
                     or just after init_task.

Because init_task is so central, scheduling also starts its scan from
init_task->next_run; so from here on, 'the head of the runqueue' means
init_task->next_run. The first two functions are sketched below.
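This is essentially what they look like in 2.2's kernel/sched.c
(abridged, so treat it as a sketch). Insertion always happens at the
front, right after init_task, which is exactly what produced the ring
diagrams above:

static inline void add_to_runqueue(struct task_struct * p)
{
    struct task_struct *next = init_task.next_run;

    p->prev_run = &init_task;       /* splice p in behind init_task */
    init_task.next_run = p;
    p->next_run = next;
    next->prev_run = p;
    nr_running++;
}

static inline void del_from_runqueue(struct task_struct * p)
{
    struct task_struct *next = p->next_run;
    struct task_struct *prev = p->prev_run;

    nr_running--;
    next->prev_run = prev;          /* unlink p wherever it sits */
    prev->next_run = next;
    p->next_run = NULL;
    p->prev_run = NULL;
}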
Two system calls set and query a process's real-time priority (the
policy itself is switched with the related sched_setscheduler call):

sched_setparam
sched_getparam

struct sched_param {
    int sched_priority;
};
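From userspace the whole family is reachable through <sched.h>. A small
example that switches the calling process to SCHED_RR (it needs root
privileges; the chosen priority value is arbitrary):

#include <sched.h>
#include <stdio.h>

int main(void)
{
    struct sched_param sp;

    sp.sched_priority = 10;     /* becomes rt_priority in the kernel */
    if (sched_setscheduler(0, SCHED_RR, &sp) < 0) {  /* 0 = this process */
        perror("sched_setscheduler");
        return 1;
    }
    /* from here on this is a SCHED_RR real-time process */
    return 0;
}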
Linux handles all runnable processes by linking them into this single
list. Note that it is the only such list in Linux; under heavy load it
will undoubtedly be inefficient.
The source of schedule():
asmlinkage void schedule(void)
{
    struct schedule_data * sched_data;
    struct task_struct *prev, *next, *p;
    int this_cpu, c;

    // run the tq_scheduler task queue first
    if (tq_scheduler)
        goto handle_tq_scheduler;
tq_scheduler_back:

    prev = current;
    this_cpu = prev->processor;

    if (in_interrupt())
        goto scheduling_in_interrupt;

    release_kernel_lock(prev, this_cpu);

    /* Do "administrative" work here while we don't hold any locks */
    // bottom-half handling
    if (bh_mask & bh_active)
        goto handle_bh;
handle_bh_back:

    /*
     * 'sched_data' is protected by the fact that we can run
     * only one process per CPU.
     */
    sched_data = & aligned_data[this_cpu].schedule_data;

    spin_lock_irq(&runqueue_lock);

    /* move an exhausted RR process to be last.. */
    // round-robin handling for real-time processes
    if (prev->policy == SCHED_RR)
        /*
         * equivalent to:
         * if (!prev->counter) {
         *     prev->counter = prev->priority;
         *     move_last_runqueue(prev);
         * }
         */
        goto move_rr_last;
move_rr_back:

    // if a signal is pending, handle it
    switch (prev->state) {
        case TASK_INTERRUPTIBLE:
            if (signal_pending(prev)) {
                prev->state = TASK_RUNNING;
                break;
            }
        default:
            del_from_runqueue(prev);
        case TASK_RUNNING:
    }
    prev->need_resched = 0;

repeat_schedule:
    /*
     * this is the scheduler proper:
     */
    p = init_task.next_run;
    /* Default process to select.. */
    next = idle_task(this_cpu);
    c = -1000;
    if (prev->state == TASK_RUNNING)
        // sets c to prev's weight and next = prev
        goto still_running;
still_running_back:

    /*
     * This is subtle.
     * Note how we can enable interrupts here, even
     * though interrupts can add processes to the run-
     * queue. This is because any new processes will
     * be added to the front of the queue, so "p" above
     * is a safe starting point.
     * run-queue deletion and re-ordering is protected by
     * the scheduler lock
     */
    /*
     * Note! there may appear new tasks on the run-queue during this, as
     * interrupts are enabled. However, they will be put on front of the
     * list, so our list starting at "p" is essentially fixed.
     */
    // scan the runqueue and pick the 'heaviest' process
    while (p != &init_task) {
        if (can_schedule(p)) {
            int weight = goodness(prev, p, this_cpu);
            if (weight > c)
                c = weight, next = p;
        }
        p = p->next_run;
    }

    /* Do we need to re-calculate counters? */
    if (!c)
        /*
         * the recalculate branch does the following:
         * for_each_task(p)
         *     p->counter = (p->counter >> 1) + p->priority;
         */
        goto recalculate;
    /*
     * from this point on nothing can prevent us from
     * switching to the next task, save this fact in
     * sched_data.
     */
    sched_data->curr = next;
#ifdef __SMP__
    next->has_cpu = 1;
    next->processor = this_cpu;
#endif
    spin_unlock_irq(&runqueue_lock);

    if (prev == next)
        goto same_process;

#ifdef __SMP__
    /*
     * maintain the per-process 'average timeslice' value.
     * (this has to be recalculated even if we reschedule to
     * the same process) Currently this is only used on SMP,
     * and it's approximate, so we do not have to maintain
     * it while holding the runqueue spinlock.
     */
    {
        cycles_t t, this_slice;

        t = get_cycles();
        this_slice = t - sched_data->last_schedule;
        sched_data->last_schedule = t;

        /*
         * Exponentially fading average calculation, with
         * some weight so it doesnt get fooled easily by
         * smaller irregularities.
         */
        prev->avg_slice = (this_slice*1 + prev->avg_slice*1)/2;
    }

    /*
     * We drop the scheduler lock early (it's a global spinlock),
     * thus we have to lock the previous process from getting
     * rescheduled during switch_to().
     */
#endif /* __SMP__ */

    kstat.context_swtch++;
    get_mmu_context(next);
    switch_to(prev, next, prev);
    __schedule_tail(prev);

same_process:
    reacquire_kernel_lock(current);
    return;

recalculate:
    {
        struct task_struct *p;
        spin_unlock_irq(&runqueue_lock);
        read_lock(&tasklist_lock);
        for_each_task(p)
            p->counter = (p->counter >> 1) + p->priority;
        read_unlock(&tasklist_lock);
        spin_lock_irq(&runqueue_lock);
        goto repeat_schedule;
    }

still_running:
    c = prev_goodness(prev, prev, this_cpu);
    next = prev;
    goto still_running_back;

handle_bh:
    do_bottom_half();
    goto handle_bh_back;

handle_tq_scheduler:
    run_task_queue(&tq_scheduler);
    goto tq_scheduler_back;

move_rr_last:
    if (!prev->counter) {
        prev->counter = prev->priority;
        move_last_runqueue(prev);
    }
    goto move_rr_back;

scheduling_in_interrupt:
    printk("Scheduling in interrupt\n");
    *(int *)0 = 0;      /* deliberate null write: oops to expose the bug */
    return;
}
In init_task these fields are initialized as follows:

struct task_struct *next_task, *prev_task;
struct task_struct *next_run, *prev_run;

All four of these point to &init_task.

pid = 0;
policy = SCHED_OTHER;
priority = 20 * HZ / 100;
#define NR_TASKS 512

struct task_struct * task[NR_TASKS];

sched_init()
{
    int nr = NR_TASKS;

    /* Init task array free list and pidhash table. */
    while(--nr > 0)
        add_free_taskslot(&task[nr]);

    for(nr = 0; nr < PIDHASH_SZ; nr++)
        pidhash[nr] = NULL;
}
struct task_struct **tarray_freelist;

extern __inline__ void add_free_taskslot(struct task_struct **t)
{
    spin_lock(&taskslot_lock);
    *t = (struct task_struct *) tarray_freelist;
    tarray_freelist = t;
    spin_unlock(&taskslot_lock);
}
The free task slots always form a singly linked list, with
tarray_freelist as its head pointer. After sched_init the task array
looks like this (task[0] holds &init_task; every free slot stores the
address of the next free slot):

 tarray_freelist
       |
       v
 ----------------------------------------------
 | &init_task | pointer | pointer | pointer ...
 ----------------------------------------------
    task[0]     task[1] --> task[2] --> task[3] --> ...
extern __inline__ struct task_struct **get_free_taskslot(void)
{
    struct task_struct **tslot;

    spin_lock(&taskslot_lock);
    if((tslot = tarray_freelist) != NULL)
        tarray_freelist = (struct task_struct **) *tslot;
    spin_unlock(&taskslot_lock);

    return tslot;
}
#define MIN_TASKS_LEFT_FOR_ROOT 4
In do_fork the new task_struct is initialized:
struct task_struct * p;
p->state = TASK_RUNNING;
p->next_run = p;
p->prev_run = p;
p->p_pptr = p->p_opptr = current;
p->p_cptr = NULL;
init_waitqueue(&p->wait_chldexit);
p->vfork_sem = NULL;
p->sigpending = 0;
sigemptyset(&p->signal);
p->sigqueue = NULL;
p->sigqueue_tail = &p->sigqueue;
p->it_real_value = p->it_virt_value = p->it_prof_value = 0;
p->it_real_incr = p->it_virt_incr = p->it_prof_incr = 0;
init_timer(&p->real_timer);
p->real_timer.data = (unsigned long) p;
p->leader = 0; /* session leadership doesn't inherit */
p->tty_old_pgrp = 0;
p->times.tms_utime = p->times.tms_stime = 0;
p->times.tms_cutime = p->times.tms_cstime = 0;
p->lock_depth = -1; /* -1 = no lock */
p->start_time = jiffies;
p->semundo = NULL;
/* ok, now we should be set up.. */
p->swappable = 1;
p->exit_signal = clone_flags & CSIGNAL;
p->pdeath_signal = 0;
/*
* "share" dynamic priority between parent and child, thus the
* total amount of dynamic priorities in the system doesnt change,
* more scheduling fairness. This is only important in the first
* timeslice, on the long run the scheduling behaviour is unchanged.
*/
current->counter >>= 1;
p->counter = current->counter;
nr_tasks++;
if (p->user)
atomic_inc(&p->user->count);
p->next_run = NULL;
p->prev_run = NULL;
wake_up_process(p); /* do this last */
This is how a process gets woken up:
void wake_up_process(struct task_struct * p)
{
    unsigned long flags;

    /*
     * We want the common case fall through straight, thus the goto.
     */
    spin_lock_irqsave(&runqueue_lock, flags);
    p->state = TASK_RUNNING;
    if (p->next_run)
        goto out;           /* already on the runqueue */
    /* add it at the head of the runqueue */
    add_to_runqueue(p);
    spin_unlock_irqrestore(&runqueue_lock, flags);

    reschedule_idle(p);
    return;
out:
    spin_unlock_irqrestore(&runqueue_lock, flags);
}
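Putting schedule() and wake_up_process together gives the classic
sleep/wake pattern. A hedged sketch with invented names (real code must
also guard against lost wakeups between the state change and the call
to schedule()):

static struct task_struct *worker_task;   /* set when the worker starts */

static void worker_sleep(void)
{
    current->state = TASK_INTERRUPTIBLE;
    schedule();     /* schedule() sees state != TASK_RUNNING and calls
                       del_from_runqueue() on us; we stop running here */
}

static void worker_kick(void)
{
    wake_up_process(worker_task);   /* sets TASK_RUNNING and puts the
                                       task back at the runqueue head */
}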
--
★ I don't crave a name that lasts forever, but wouldn't mind leaving a
    little fragrance behind
^|^ Untainted, unattached, unswayed, unforgetting; I just want to live
    this life lightly
〉 Drifting through life, coming and going alone; I wouldn't mind
    leaving behind some tenderness, some abandon
--
※ Origin: 笑 书 亭 <bbs.zd.dhs.org>
◆ From: 210.32.151.168
--
※ Forwarded: BBS 荔园晨风站 bbs.szu.edu.cn [FROM: 192.168.1.11]