Category Archives: kernel

Compiling the Kernel in a Virtual Machine

1. Download the kernel source

 
    #cd  /usr/src
    #wget  http://www.kernel.org/pub/linux/kernel/v2.6/linux-2.6.29.tar.gz

2. Extract the kernel source

 
    #cd  /usr/src
    #tar  zxvf  linux-2.6.29.tar.gz
    #ln  -s  linux-2.6.29  linux

3. Patch the kernel source (optional)

 
    #cd  /usr/src/linux
    #patch  -p1  < /…/…/xxx.patch            # apply the patch

4. Configure the kernel. Use the lspci command to look at the chips on the motherboard; pay attention to the disk type, the southbridge chipset, the SCSI card model, the NIC, the sound card, and so on.

4.1 Use the default configuration:
 
    #make  defconfig
4.2 Alternatively, using the configuration file of the currently running kernel as the starting point for the new configuration is a good idea:
 
    #cp  /boot/config-`uname -r`   /usr/src/linux/.config
Then run the configuration tool:
 
    #make  menuconfig
4.3 In the menu, choose "Load an Alternate Configuration File" and select the ".config" file as the configuration file.
4.4 When configuration is finished, choose "Save an Alternate Configuration File".

5. Modify the kernel version string

 
    #cd  /usr/src/linux
    #vim  Makefile
Change the EXTRAVERSION line to "EXTRAVERSION = .zhu.scst2.0.0.1".
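
The top of the Makefile for this tree should then read roughly as follows (the first three values come from the 2.6.29 release; only the EXTRAVERSION line is edited):

 
    VERSION = 2
    PATCHLEVEL = 6
    SUBLEVEL = 29
    EXTRAVERSION = .zhu.scst2.0.0.1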

6. Build the kernel

 
    #make  clean
    #make  bzImage
The resulting kernel image is placed under /usr/src/linux/arch/x86/boot/.

7. Build the kernel modules

 
    #make  modules

8. Install the kernel modules

 
    #make  modules_install
This copies the compiled modules into the standard system location (under /lib/modules/).

9. Install the kernel

 
    #make install

This command will:

  • copy the compressed kernel image into the /boot directory;
  • call mkinitrd to create an initrd image for the new kernel;
  • for GRUB, update the /boot/grub/grub.conf configuration file (an example entry is sketched below).
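
For example, on a GRUB-legacy system the entry that make install ends up adding to grub.conf looks roughly like the following sketch; the boot disk, partition, and paths depend on the machine, so (hd0,0) and /dev/sda1 are only illustrative:

 
    title Linux (2.6.29.zhu.scst2.0.0.1)
        root (hd0,0)
        kernel /vmlinuz-2.6.29.zhu.scst2.0.0.1 ro root=/dev/sda1
        initrd /initrd-2.6.29.zhu.scst2.0.0.1.img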

Source Analysis of Wait Queues in the Linux Kernel

A wait queue is simply a list of processes, all of which are waiting for a specific event.

In Linux, a wait queue is managed by a wait queue head, a structure of type wait_queue_head_t defined in <linux/wait.h>.

To define and initialize a wait queue head statically:

DECLARE_WAIT_QUEUE_HEAD(name);

Or, using the dynamic method:

wait_queue_head_t  my_queue;
init_waitqueue_head(&my_queue);

When a process sleeps, it does so expecting that some condition will become true in the future. When a sleeping process is woken up, it must check again that the condition it was waiting for is indeed true.

The simplest way to sleep in the Linux kernel is the family of macros called wait_event.

wait_event_interruptible(queue, condition)  // queue is the wait queue head, passed by value rather than by pointer; condition is a boolean expression

The other half of the picture is the wake-up: some other thread of execution (another process, or an interrupt handler) must perform the wake-up for us.

wake_up_interruptible(wait_queue_head_t *queue);
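
Putting the two halves together, a minimal driver-style sketch (my_queue and my_flag are illustrative names, not taken from the original text) looks like this:

 
static DECLARE_WAIT_QUEUE_HEAD(my_queue);
static int my_flag;

/* Reader side: sleep until my_flag becomes non-zero. */
static int wait_for_data(void)
{
    if (wait_event_interruptible(my_queue, my_flag != 0))
        return -ERESTARTSYS;        /* the sleep was interrupted by a signal */
    return 0;
}

/* Writer side (another process or an interrupt handler): make the
 * condition true first, then wake the sleepers. */
static void data_arrived(void)
{
    my_flag = 1;
    wake_up_interruptible(&my_queue);
}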

In <linux/wait.h>:

wait_queue_head_t is defined as follows:

 
struct __wait_queue_head {
    spinlock_t lock;        // spinlock
    struct list_head task_list;    // list of waiters
};
typedef struct __wait_queue_head wait_queue_head_t;

wait_queue_t is defined as follows:

 
typedef struct __wait_queue wait_queue_t;
typedef int (*wait_queue_func_t)(wait_queue_t *wait, unsigned mode, int sync, void *key);
int default_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key);

struct __wait_queue {
    unsigned int flags;
#define WQ_FLAG_EXCLUSIVE    0x01
    void *private;    
    wait_queue_func_t func;
    struct list_head task_list;    // list node
};
 

Initialization of the wait queue head wait_queue_head_t:

 
#define __WAIT_QUEUE_HEAD_INITIALIZER(name) {                \
    .lock        = __SPIN_LOCK_UNLOCKED(name.lock),        \
    .task_list    = { &(name).task_list, &(name).task_list } }

#define DECLARE_WAIT_QUEUE_HEAD(name) \
    wait_queue_head_t name = __WAIT_QUEUE_HEAD_INITIALIZER(name)

Initialization of a wait queue entry wait_queue_t:

 
#define __WAITQUEUE_INITIALIZER(name, tsk) {                \
    .private    = tsk,                        \
    .func        = default_wake_function,            \
    .task_list    = { NULL, NULL } }

#define DECLARE_WAITQUEUE(name, tsk)                    \
    wait_queue_t name = __WAITQUEUE_INITIALIZER(name, tsk)
#define DEFINE_WAIT(name)                        \
    wait_queue_t name = {                        \
        .private    = current,                \        // stores the current process descriptor
        .func        = autoremove_wake_function,        \
        .task_list    = LIST_HEAD_INIT((name).task_list),    \
    }
int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
    int ret = default_wake_function(wait, mode, sync, key);

    if (ret)
        list_del_init(&wait->task_list);
    return ret;
}
int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
              void *key)
{
    return try_to_wake_up(curr->private, mode, sync);
}
/***
 * try_to_wake_up - wake up a thread
 * @p: the to-be-woken-up thread
 * @state: the mask of task states that can be woken
 * @sync: do a synchronous wakeup?
 *
 * Put it on the run-queue if it's not already there. The "current"
 * thread is always on the run-queue (except when the actual
 * re-schedule is in progress), and as such you're allowed to do
 * the simpler "current->state = TASK_RUNNING" to mark yourself
 * runnable without the overhead of this.
 *
 * returns failure only if the task is already active.
 */
static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
{
    int cpu, orig_cpu, this_cpu, success = 0;
    unsigned long flags;
    long old_state;
    struct rq *rq;

    if (!sched_feat(SYNC_WAKEUPS))
        sync = 0;

#ifdef CONFIG_SMP
    if (sched_feat(LB_WAKEUP_UPDATE)) {
        struct sched_domain *sd;

        this_cpu = raw_smp_processor_id();
        cpu = task_cpu(p);

        for_each_domain(this_cpu, sd) {
            if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
                update_shares(sd);
                break;
            }
        }
    }
#endif

    smp_wmb();
    rq = task_rq_lock(p, &flags);
    update_rq_clock(rq);
    old_state = p->state;
    if (!(old_state & state))
        goto out;

    if (p->se.on_rq)
        goto out_running;

    cpu = task_cpu(p);
    orig_cpu = cpu;
    this_cpu = smp_processor_id();

#ifdef CONFIG_SMP
    if (unlikely(task_running(rq, p)))
        goto out_activate;

    cpu = p->sched_class->select_task_rq(p, sync);
    if (cpu != orig_cpu) {
        set_task_cpu(p, cpu);
        task_rq_unlock(rq, &flags);
        /* might preempt at this point */
        rq = task_rq_lock(p, &flags);
        old_state = p->state;
        if (!(old_state & state))
            goto out;
        if (p->se.on_rq)
            goto out_running;

        this_cpu = smp_processor_id();
        cpu = task_cpu(p);
    }

#ifdef CONFIG_SCHEDSTATS
    schedstat_inc(rq, ttwu_count);
    if (cpu == this_cpu)
        schedstat_inc(rq, ttwu_local);
    else {
        struct sched_domain *sd;
        for_each_domain(this_cpu, sd) {
            if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
                schedstat_inc(sd, ttwu_wake_remote);
                break;
            }
        }
    }
#endif /* CONFIG_SCHEDSTATS */

out_activate:
#endif /* CONFIG_SMP */
    schedstat_inc(p, se.nr_wakeups);
    if (sync)
        schedstat_inc(p, se.nr_wakeups_sync);
    if (orig_cpu != cpu)
        schedstat_inc(p, se.nr_wakeups_migrate);
    if (cpu == this_cpu)
        schedstat_inc(p, se.nr_wakeups_local);
    else
        schedstat_inc(p, se.nr_wakeups_remote);
    activate_task(rq, p, 1);
    success = 1;

out_running:
    trace_sched_wakeup(rq, p, success);
    check_preempt_curr(rq, p, sync);

    p->state = TASK_RUNNING;
#ifdef CONFIG_SMP
    if (p->sched_class->task_wake_up)
        p->sched_class->task_wake_up(rq, p);
#endif
out:
    current->se.last_wakeup = current->se.sum_exec_runtime;

    task_rq_unlock(rq, &flags);

    return success;
}

wait_event_interruptible is defined as follows:

 
#define wait_event_interruptible(wq, condition)                \
({                                    \
    int __ret = 0;                            \
    if (!(condition))                        \
        __wait_event_interruptible(wq, condition, __ret);    \
    __ret;                                \
})
 
 
#define __wait_event_interruptible(wq, condition, ret)            \
do {                                    \
    DEFINE_WAIT(__wait);                        \
                                    \
    for (;;) {                            \
        prepare_to_wait(&wq, &__wait, TASK_INTERRUPTIBLE);    \
        if (condition)                        \
            break;                        \
        if (!signal_pending(current)) {                \
            schedule();                    \
            continue;                    \
        }                            \
        ret = -ERESTARTSYS;                    \
        break;                            \
    }                                \
    finish_wait(&wq, &__wait);                    \
} while (0)
 
void prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state)
{
    unsigned long flags;

    wait->flags &= ~WQ_FLAG_EXCLUSIVE;
    spin_lock_irqsave(&q->lock, flags);
    if (list_empty(&wait->task_list))
        __add_wait_queue(q, wait);
    set_current_state(state);
    spin_unlock_irqrestore(&q->lock, flags);
}
 
static inline void __add_wait_queue(wait_queue_head_t *head, wait_queue_t *new)
{
    list_add(&new->task_list, &head->task_list);
}
 
void finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
{
    unsigned long flags;

    __set_current_state(TASK_RUNNING);
    /*
     * We can check for list emptiness outside the lock
     * IFF:
     * - we use the "careful" check that verifies both
     * the next and prev pointers, so that there cannot
     * be any half-pending updates in progress on other
     * CPU's that we haven't seen yet (and that might
     * still change the stack area.
     * and
     * - all other users take the lock (ie we can only
     * have _one_ other CPU that looks at or modifies
     * the list).
     */
    if (!list_empty_careful(&wait->task_list)) {
        spin_lock_irqsave(&q->lock, flags);
        list_del_init(&wait->task_list);
        spin_unlock_irqrestore(&q->lock, flags);
    }
}
 

wake_up_interruptible(wait_queue_head_t *queue) is defined as follows:

 
#define wake_up_interruptible(x)    __wake_up(x, TASK_INTERRUPTIBLE, 1, NULL)
 
 
void __wake_up(wait_queue_head_t *q, unsigned int mode,
            int nr_exclusive, void *key)
{
    unsigned long flags;

    spin_lock_irqsave(&q->lock, flags);
    __wake_up_common(q, mode, nr_exclusive, 0, key);
    spin_unlock_irqrestore(&q->lock, flags);
}
 
void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
            int nr_exclusive, int sync, void *key)
{
    wait_queue_t *curr, *next;

    list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
        unsigned flags = curr->flags;

        if (curr->func(curr, mode, sync, key) &&
                (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
            break;
    }
}

Concurrency and Race Conditions in the Linux Kernel: Semaphore Source Analysis

In include/linux/semaphore.h:

 
/* Please don't access any members of this structure directly */
struct semaphore {
    spinlock_t        lock;
    unsigned int        count;
    struct list_head    wait_list;
};

#define __SEMAPHORE_INITIALIZER(name, n)                \
{                                    \
    .lock        = __SPIN_LOCK_UNLOCKED((name).lock),        \
    .count        = n,                        \
    .wait_list    = LIST_HEAD_INIT((name).wait_list),        \
}

#define DECLARE_MUTEX(name)    \
    struct semaphore name = __SEMAPHORE_INITIALIZER(name, 1)

static inline void sema_init(struct semaphore *sem, int val)
{
    static struct lock_class_key __key;
    *sem = (struct semaphore) __SEMAPHORE_INITIALIZER(*sem, val);
    lockdep_init_map(&sem->lock.dep_map, "semaphore->lock", &__key, 0);
}

#define init_MUTEX(sem)        sema_init(sem, 1)
#define init_MUTEX_LOCKED(sem)    sema_init(sem, 0)
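
As a usage sketch (my_sem, my_setup, and my_write are illustrative names, not from the original post): a driver usually initializes the semaphore to 1 and brackets each critical section with down_interruptible()/up():

 
static struct semaphore my_sem;

static void my_setup(void)
{
    sema_init(&my_sem, 1);            /* count of 1: the semaphore acts as a mutex */
}

static int my_write(void)
{
    if (down_interruptible(&my_sem))  /* may sleep; returns non-zero if interrupted by a signal */
        return -ERESTARTSYS;
    /* ... touch the shared device state ... */
    up(&my_sem);
    return 0;
}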

In kernel/semaphore.c:

 
/**
 * down - acquire the semaphore
 * @sem: the semaphore to be acquired
 *
 * Acquires the semaphore. If no more tasks are allowed to acquire the
 * semaphore, calling this function will put the task to sleep until the
 * semaphore is released.
 *
 * Use of this function is deprecated, please use down_interruptible() or
 * down_killable() instead.
 */
void down(struct semaphore *sem)
{
    unsigned long flags;

    spin_lock_irqsave(&sem->lock, flags);
    if (likely(sem->count > 0))
        sem->count--;
    else
        __down(sem);
    spin_unlock_irqrestore(&sem->lock, flags);
}
EXPORT_SYMBOL(down);

/**
 * down_interruptible - acquire the semaphore unless interrupted
 * @sem: the semaphore to be acquired
 *
 * Attempts to acquire the semaphore. If no more tasks are allowed to
 * acquire the semaphore, calling this function will put the task to sleep.
 * If the sleep is interrupted by a signal, this function will return -EINTR.
 * If the semaphore is successfully acquired, this function returns 0.
 */
int down_interruptible(struct semaphore *sem)
{
    unsigned long flags;
    int result = 0;

    spin_lock_irqsave(&sem->lock, flags);
    if (likely(sem->count > 0))
        sem->count--;
    else
        result = __down_interruptible(sem);
    spin_unlock_irqrestore(&sem->lock, flags);

    return result;
}
EXPORT_SYMBOL(down_interruptible);

/**
 * down_killable - acquire the semaphore unless killed
 * @sem: the semaphore to be acquired
 *
 * Attempts to acquire the semaphore. If no more tasks are allowed to
 * acquire the semaphore, calling this function will put the task to sleep.
 * If the sleep is interrupted by a fatal signal, this function will return
 * -EINTR. If the semaphore is successfully acquired, this function returns
 * 0.
 */
int down_killable(struct semaphore *sem)
{
    unsigned long flags;
    int result = 0;

    spin_lock_irqsave(&sem->lock, flags);
    if (likely(sem->count > 0))
        sem->count--;
    else
        result = __down_killable(sem);
    spin_unlock_irqrestore(&sem->lock, flags);

    return result;
}
EXPORT_SYMBOL(down_killable);

/**
 * down_trylock - try to acquire the semaphore, without waiting
 * @sem: the semaphore to be acquired
 *
 * Try to acquire the semaphore atomically. Returns 0 if the mutex has
 * been acquired successfully or 1 if it cannot be acquired.
 *
 * NOTE: This return value is inverted from both spin_trylock and
 * mutex_trylock! Be careful about this when converting code.
 *
 * Unlike mutex_trylock, this function can be used from interrupt context,
 * and the semaphore can be released by any task or interrupt.
 */
int down_trylock(struct semaphore *sem)
{
    unsigned long flags;
    int count;

    spin_lock_irqsave(&sem->lock, flags);
    count = sem->count - 1;
    if (likely(count >= 0))
        sem->count = count;
    spin_unlock_irqrestore(&sem->lock, flags);

    return (count < 0);
}
EXPORT_SYMBOL(down_trylock);

/**
 * down_timeout - acquire the semaphore within a specified time
 * @sem: the semaphore to be acquired
 * @jiffies: how long to wait before failing
 *
 * Attempts to acquire the semaphore. If no more tasks are allowed to
 * acquire the semaphore, calling this function will put the task to sleep.
 * If the semaphore is not released within the specified number of jiffies,
 * this function returns -ETIME. It returns 0 if the semaphore was acquired.
 */
int down_timeout(struct semaphore *sem, long jiffies)
{
    unsigned long flags;
    int result = 0;

    spin_lock_irqsave(&sem->lock, flags);
    if (likely(sem->count > 0))
        sem->count--;
    else
        result = __down_timeout(sem, jiffies);
    spin_unlock_irqrestore(&sem->lock, flags);

    return result;
}
EXPORT_SYMBOL(down_timeout);

/**
 * up - release the semaphore
 * @sem: the semaphore to release
 *
 * Release the semaphore. Unlike mutexes, up() may be called from any
 * context and even by tasks which have never called down().
 */
void up(struct semaphore *sem)
{
    unsigned long flags;

    spin_lock_irqsave(&sem->lock, flags);
    if (likely(list_empty(&sem->wait_list)))
        sem->count++;
    else
        __up(sem);
    spin_unlock_irqrestore(&sem->lock, flags);
}
EXPORT_SYMBOL(up);

/* Functions for the contended case */

struct semaphore_waiter {
    struct list_head list;
    struct task_struct *task;
    int up;
};

/*
 * Because this function is inlined, the 'state' parameter will be
 * constant, and thus optimised away by the compiler. Likewise the
 * 'timeout' parameter for the cases without timeouts.
 */
static inline int __sched __down_common(struct semaphore *sem, long state,
                                long timeout)
{
    struct task_struct *task = current;
    struct semaphore_waiter waiter;

    list_add_tail(&waiter.list, &sem->wait_list);
    waiter.task = task;
    waiter.up = 0;

    for (;;) {
        if (signal_pending_state(state, task))
            goto interrupted;
        if (timeout <= 0)
            goto timed_out;
        __set_task_state(task, state);
        spin_unlock_irq(&sem->lock);
        timeout = schedule_timeout(timeout);
        spin_lock_irq(&sem->lock);
        if (waiter.up)
            return 0;
    }

 timed_out:
    list_del(&waiter.list);
    return -ETIME;

 interrupted:
    list_del(&waiter.list);
    return -EINTR;
}

static noinline void __sched __down(struct semaphore *sem)
{
    __down_common(sem, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
}

static noinline int __sched __down_interruptible(struct semaphore *sem)
{
    return __down_common(sem, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
}

static noinline int __sched __down_killable(struct semaphore *sem)
{
    return __down_common(sem, TASK_KILLABLE, MAX_SCHEDULE_TIMEOUT);
}

static noinline int __sched __down_timeout(struct semaphore *sem, long jiffies)
{
    return __down_common(sem, TASK_UNINTERRUPTIBLE, jiffies);
}

static noinline void __sched __up(struct semaphore *sem)
{
    struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list,
                        struct semaphore_waiter, list);
    list_del(&waiter->list);
    waiter->up = 1;
    wake_up_process(waiter->task);
}

The Implementation of poll in the Linux Kernel

First, a description of the functionality and definition of the poll function.

APUE (page 384) gives the definition of poll:

#include <poll.h>
int poll(struct pollfd fdarray[], nfds_t nfds, int timeout);

Returns: the number of ready descriptors, 0 if the call timed out, or -1 on error.

struct pollfd{
    int  fd;                //file descriptor to check, or < 0 to ignore
    short  events;    // events of interest on fd
    short revents;    // events that occurred on fd
};

nfds gives the number of elements in the fdarray array.

The events flags include POLLIN, POLLOUT, and so on.

The revents flags include POLLIN, POLLOUT, POLLERR, POLLHUP, POLLNVAL, and so on.

The last argument to poll specifies how long we are willing to wait:

  • timeout == -1: wait forever.
  • timeout == 0: do not wait; test all descriptors and return immediately.
  • timeout > 0: wait timeout milliseconds (a small user-space example follows this list).
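
A small user-space sketch of the call just described (the timeout value and the choice of standard input are only illustrative):

 
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    struct pollfd fds[1];
    int n;

    fds[0].fd = STDIN_FILENO;      /* watch standard input */
    fds[0].events = POLLIN;        /* interested in readable data */

    n = poll(fds, 1, 5000);        /* wait at most 5000 milliseconds */
    if (n < 0)
        perror("poll");
    else if (n == 0)
        printf("timeout\n");
    else if (fds[0].revents & POLLIN)
        printf("stdin is readable\n");
    return 0;
}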

In linux/poll.h the following are defined:

 
typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *, struct poll_table_struct *);

typedef struct poll_table_struct {
    poll_queue_proc qproc;
} poll_table;

static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
{
    if (p && wait_address)
        p->qproc(filp, wait_address, p);
}

static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc)
{
    pt->qproc = qproc;
}

struct poll_table_entry {
    struct file *filp;                // the file being polled
    wait_queue_t wait;        // __pollwait initializes this entry and adds it to the wait queue
    wait_queue_head_t *wait_address;        // the wait queue head, usually the one belonging to the file
};

/*
 * Structures and helpers for sys_poll/sys_poll
 */
struct poll_wqueues {
    poll_table pt;
    poll_table_page *table;     // poll_table_page is defined in select.c; used to allocate more entries when the inline space runs out
    struct task_struct *polling_task;
    int triggered;
    int error;
    int inline_index;
    struct poll_table_entry inline_entries[N_INLINE_POLL_ENTRIES];        // inline (built-in) entries
};

extern void poll_initwait(struct poll_wqueues *pwq);
extern void poll_freewait(struct poll_wqueues *pwq);
extern int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
                 ktime_t *expires, unsigned long slack);

static inline int poll_schedule(struct poll_wqueues *pwq, int state)
{
    return poll_schedule_timeout(pwq, state, NULL, 0);
}
 
poll_table_page is defined as follows:
struct poll_table_page {
    struct poll_table_page * next;
    struct poll_table_entry * entry;
    struct poll_table_entry entries[0];
};

In fs/select.c:

do_sys_poll is defined as follows:

 
#define N_STACK_PPS ((sizeof(stack_pps) - sizeof(struct poll_list)) / \
            sizeof(struct pollfd))
#define POLLFD_PER_PAGE ((PAGE_SIZE-sizeof(struct poll_list)) / sizeof(struct pollfd))
 
int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
        struct timespec *end_time)
{
    struct poll_wqueues table;            // the key data structure
    int err = -EFAULT, fdcount, len, size;
    /* Allocate small arguments on the stack to save memory and be
       faster - use long to make sure the buffer is aligned properly
       on 64 bit archs to avoid unaligned access */
    long stack_pps[POLL_STACK_ALLOC/sizeof(long)];            // POLL_STACK_ALLOC = 256
    struct poll_list *const head = (struct poll_list *)stack_pps;        // poll_list is defined below
    struct poll_list *walk = head;
    unsigned long todo = nfds;

    if (nfds > current->signal->rlim[RLIMIT_NOFILE].rlim_cur)    // limit on the number of files a process may open
        return -EINVAL;
    // Use the stack space first; if it is not enough, fall back to heap allocations. This both saves memory and improves speed.
    // Copy the elements of struct pollfd __user *ufds into the poll_list chain headed by head.
    len = min_t(unsigned int, nfds, N_STACK_PPS);
    for (;;) {
        walk->next = NULL;
        walk->len = len;
        if (!len)
            break;

        if (copy_from_user(walk->entries, ufds + nfds-todo,
                    sizeof(struct pollfd) * walk->len))
            goto out_fds;

        todo -= walk->len;
        if (!todo)
            break;

        len = min(todo, POLLFD_PER_PAGE);
        size = sizeof(struct poll_list) + sizeof(struct pollfd) * len;
        walk = walk->next = kmalloc(size, GFP_KERNEL);
        if (!walk) {
            err = -ENOMEM;
            goto out_fds;
        }
    }
    poll_initwait(&table);         // initialize table; poll_initwait is defined below
    fdcount = do_poll(nfds, head, &table, end_time);    // do_poll is defined below
    poll_freewait(&table);        // tear down the entries recorded in table; poll_freewait is defined below
    // Copy the resulting event information back to user space.
    for (walk = head; walk; walk = walk->next) {
        struct pollfd *fds = walk->entries;
        int j;

        for (j = 0; j < walk->len; j++, ufds++)
            if (__put_user(fds[j].revents, &ufds->revents))
                goto out_fds;
      }

    err = fdcount;
out_fds:
    walk = head->next;
    while (walk) {
        struct poll_list *pos = walk;
        walk = walk->next;
        kfree(pos);
    }

    return err;
}

poll_list is defined as follows:

 
struct poll_list {
    struct poll_list *next;
    int len;                            // length of the entries array
    struct pollfd entries[0];    // zero-length (flexible) array of pollfd entries
};

poll_initwait is defined as follows:

 
void poll_initwait(struct poll_wqueues *pwq)
{
    init_poll_funcptr(&pwq->pt, __pollwait);
    pwq->polling_task = current;        //important
    pwq->error = 0;
    pwq->table = NULL;
    pwq->inline_index = 0;
}

__pollwait is defined as follows:

 
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
                poll_table *p)
{
    struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt);
    struct poll_table_entry *entry = poll_get_entry(pwq);
    if (!entry)
        return;
    get_file(filp);
    entry->filp = filp;
    entry->wait_address = wait_address;
    init_waitqueue_func_entry(&entry->wait, pollwake);    // init_waitqueue_func_entry and pollwake are both defined below
    entry->wait.private = pwq;
    add_wait_queue(wait_address, &entry->wait);
}

init_waitqueue_func_entry is defined as follows:

 
static inline void init_waitqueue_func_entry(wait_queue_t *q,
                    wait_queue_func_t func)
{
    q->flags = 0;
    q->private = NULL;
    q->func = func;
}

pollwake is defined as follows:

 
static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
    struct poll_wqueues *pwq = wait->private;
    DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);

    /*
     * Although this function is called under waitqueue lock, LOCK
     * doesn't imply write barrier and the users expect write
     * barrier semantics on wakeup functions. The following
     * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
     * and is paired with set_mb() in poll_schedule_timeout.
     */
    smp_wmb();
    pwq->triggered = 1;

    /*
     * Perform the default wake up operation using a dummy
     * waitqueue.
     *
     * TODO: This is hacky but there currently is no interface to
     * pass in @sync. @sync is scheduled to be removed and once
     * that happens, wake_up_process() can be used directly.
     */
    return default_wake_function(&dummy_wait, mode, sync, key);
}

default_wake_function is defined as follows:

 
int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
              void *key)
{
    return try_to_wake_up(curr->private, mode, sync);
}

poll_get_entry is defined as follows:

 
#define POLL_TABLE_FULL(table) \
    ((unsigned long)((table)->entry+1) > PAGE_SIZE + (unsigned long)(table))
 
static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p)
{
    struct poll_table_page *table = p->table;

    if (p->inline_index < N_INLINE_POLL_ENTRIES)    // use the inline entries first
        return p->inline_entries + p->inline_index++;

    if (!table || POLL_TABLE_FULL(table)) {            // inline entries exhausted and current page full: allocate a new page
        struct poll_table_page *new_table;

        new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL);
        if (!new_table) {
            p->error = -ENOMEM;
            return NULL;
        }
        new_table->entry = new_table->entries;
        new_table->next = table;
        p->table = new_table;
        table = new_table;
    }

    return table->entry++;        // hand out the next entry from the allocated page
}

poll_freewait is defined as follows:

 
static void free_poll_entry(struct poll_table_entry *entry)
{
    remove_wait_queue(entry->wait_address, &entry->wait);
    fput(entry->filp);
}
 
void poll_freewait(struct poll_wqueues *pwq)
{
    struct poll_table_page * p = pwq->table;
    int i;
    for (i = 0; i < pwq->inline_index; i++)
        free_poll_entry(pwq->inline_entries + i);
    while (p) {
        struct poll_table_entry * entry;
        struct poll_table_page *old;

        entry = p->entry;
        do {
            entry--;
            free_poll_entry(entry);
        } while (entry > p->entries);
        old = p;
        p = p->next;
        free_page((unsigned long) old);
    }
}

do_poll is defined as follows:

 
static int do_poll(unsigned int nfds, struct poll_list *list,
           struct poll_wqueues *wait, struct timespec *end_time)
{
    poll_table* pt = &wait->pt;
    ktime_t expire, *to = NULL;
    int timed_out = 0, count = 0;
    unsigned long slack = 0;

    /* Optimise the no-wait case */
    if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {    // if the timeout is zero, pt is set to NULL and the poll call will not block
        pt = NULL;
        timed_out = 1;
    }

    if (end_time && !timed_out)
        slack = estimate_accuracy(end_time);

    for (;;) {
        struct poll_list *walk;

        for (walk = list; walk != NULL; walk = walk->next) {
            struct pollfd * pfd, * pfd_end;

            pfd = walk->entries;
            pfd_end = pfd + walk->len;
            for (; pfd != pfd_end; pfd++) {
                /*
                 * Fish for events. If we found one, record it
                 * and kill the poll_table, so we don't
                 * needlessly register any other waiters after
                 * this. They'll get immediately deregistered
                 * when we break out and return.
                 */
                if (do_pollfd(pfd, pt)) {
                    count++;                // once an event is found, poll no longer needs to block
                    pt = NULL;
                }
            }
        }
        /*
         * All waiters have already been registered, so don't provide
         * a poll_table to them on the next loop iteration.
         */
        pt = NULL;
        if (!count) {
            count = wait->error;
            if (signal_pending(current))
                count = -EINTR;
        }
        if (count || timed_out)    // break out of the loop if an event was found or the timeout expired
            break;

        /*
         * If this is the first loop and we have a timeout
         * given, then we convert to ktime_t and set the to
         * pointer to the expiry value.
         */
        if (end_time && !to) {
            expire = timespec_to_ktime(*end_time);
            to = &expire;
        }

        if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))    // go to sleep and wait to be woken up
            timed_out = 1;
    }
    return count;
}

do_pollfd is defined as follows:

 
/*
 * Fish for pollable events on the pollfd->fd file descriptor. We're only
 * interested in events matching the pollfd->events mask, and the result
 * matching that mask is both recorded in pollfd->revents and returned. The
 * pwait poll_table will be used by the fd-provided poll handler for waiting,
 * if non-NULL.
 */
static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
{
    unsigned int mask;
    int fd;

    mask = 0;
    fd = pollfd->fd;
    if (fd >= 0) {
        int fput_needed;
        struct file * file;

        file = fget_light(fd, &fput_needed);    // look up the struct file for this descriptor
        mask = POLLNVAL;
        if (file != NULL) {
            mask = DEFAULT_POLLMASK;
            if (file->f_op && file->f_op->poll)
                mask = file->f_op->poll(file, pwait);    // call the device driver's poll method, which may in turn call poll_wait (defined in poll.h)
            /* Mask out unneeded events. */
            mask &= pollfd->events | POLLERR | POLLHUP;   // mask out events the caller did not ask for
            fput_light(file, fput_needed);
        }
    }
    pollfd->revents = mask;        // record which events actually occurred

    return mask;
}
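
For comparison, the f_op->poll method that do_pollfd calls usually looks like the following sketch; my_dev, read_queue, and data_ready are illustrative names, not from any particular driver:

 
static unsigned int my_poll(struct file *filp, poll_table *pwait)
{
    struct my_dev *dev = filp->private_data;     /* hypothetical per-device structure */
    unsigned int mask = 0;

    /* Register on the device's wait queue; with a non-NULL table this
     * ends up in __pollwait via poll_wait(). */
    poll_wait(filp, &dev->read_queue, pwait);

    if (dev->data_ready)                         /* hypothetical readiness flag */
        mask |= POLLIN | POLLRDNORM;

    return mask;
}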

LDD3 Study Notes

  1. Wrote the scull module, scull01.tar.gz. It includes the functions
  • scull_llseek,
  • scull_read,
  • scull_write,
  • scull_open,
  • scull_release.
  2. Added a proc interface for debugging, to inspect the state of the driver (device). The source is scull02.tar.gz, which adds the
  • scull_read_procmem,
  • scull_create_proc,
  • scull_remove_proc functions.
  3. Added a seq_file (proc) interface to work around the shortcomings of plain proc. The source is scull03.tar.gz.

  4. Added ioctl calls. The source is scull04.tar.gz.

  5. Wrote a new module (page 151 of LDD3) implementing simple sleeping. The source is sleepy.tar.gz.

  6. Wrote a new module (page 153 of LDD3) implementing blocking I/O. The source is piepe.tar.gz.

  7. Added a poll function on top of piepe.tar.gz (page 165 of LDD3). The source is piepe.02.tar.gz.

  8. Wrote a module for timing, delays, and deferred work. The source is myjit.tar.gz.

    #head -8  /proc/currentime             # get the current time
    #dd   bs=20  count=5  <  /proc/jitbusy    # busy waiting
    #dd bs=20 count=5 < /proc/jitsched        # uses schedule() to yield the CPU
    #dd bs=20 count=5 < /proc/jitschedto     # uses schedule_timeout() to wait for a timeout
    #dd bs=20 count=5 < /proc/jitqueue         # uses wait_event_interruptible_timeout() to wait for a timeout
    #cat  /proc/jitimer            # uses a kernel timer
    #cat  /proc/jitasklet           # uses a tasklet
    #cat /proc/jitasklethi         # uses a high-priority tasklet
  9. Wrote a memory-allocation module using the slab allocator. The source is scullc.tar.gz.
  10. Studied interrupt handling. The source is shortp.tar.gz.
  11. Added a tasklet on top of shortp. The source is shortp02.tar.gz.
  12. Wrote a PCI driver. The source is pci.tar.gz.
  13. Wrote a bus type, a bus device, and a bus driver. The source is lddbus.tar.gz.
  14. Wrote a device on that bus. The source is ldddevice.tar.gz.
  15. Wrote the scullv device, which obtains its memory with vmalloc and implements mmap memory mapping, mainly by implementing the fault method. The source is scullv.tar.gz.
  16. Wrote a block device driver, similar to a ramdisk. The source is sbull.tar.gz.

Concurrency and Race Conditions in the Linux Kernel: Spinlock Source Analysis

In include/linux/spinlock_types.h:

typedef struct{
    raw_spinlock_t  raw_lock;
    ...
}spinlock_t;
 
In arch/x86/include/asm/spinlock_types.h:
typedef struct raw_spinlock{
    unsigned int slock;            // initial value is 0
}raw_spinlock_t;
 
In include/linux/spinlock.h:
#define  spin_lock(lock)    _spin_lock(lock)
#define spin_lock_bh(lock)  _spin_lock_bh(lock)
#define spin_lock_irq(lock) _spin_lock_irq(lock)
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
#define spin_lock_irqsave(lock, flags)  \
    do{        \
        typecheck(unsigned long , flags);    \
        flags = _spin_lock_irqsave(lock);    \
    }while(0)
#endif
 
#define spin_unlock(lock) _spin_unlock(lock)
#define spin_unlock_bh(lock) _spin_unlock_bh(lock)
#define spin_unlock_irq(lock) _spin_unlock_irq(lock)
 
#define spin_unlock_irqrestore(lock, flags)  \
    do{        \
        typecheck(unsigned long , flags);    \
        _spin_unlock_irqrestore(lock, flags);    \
    }while(0)
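
On the driver side these macros are used roughly as in the following sketch (my_lock and shared_count are illustrative names):

 
static DEFINE_SPINLOCK(my_lock);
static int shared_count;

static void bump_count(void)
{
    unsigned long flags;

    spin_lock_irqsave(&my_lock, flags);      /* disable local interrupts and take the lock */
    shared_count++;                          /* the critical section must be short and must not sleep */
    spin_unlock_irqrestore(&my_lock, flags);
}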
 

In kernel/spinlock.c:

void __lockfunc _spin_lock(spinlock_t *lock)
{
    preempt_disable();
    spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
    LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
}
 
void __lockfunc _spin_lock_bh(spinlock_t *lock)
{
    local_bh_disable();
    preempt_disable();
    spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
    LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
}
 
void __lockfunc _spin_lock_irq(spinlock_t *lock)
{
    local_irq_disable();
    preempt_disable();
    spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
    LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
}
 
unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
{
    unsigned long flags;

    local_irq_save(flags);
    preempt_disable();
    spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
    /*
     * On lockdep we dont want the hand-coded irq-enable of
     * _raw_spin_lock_flags() code, because lockdep assumes
     * that interrupts are not re-enabled during lock-acquire:
     */
#ifdef CONFIG_LOCKDEP
    LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
#else
    _raw_spin_lock_flags(lock, &flags);
#endif
    return flags;
}
void __lockfunc _spin_unlock(spinlock_t *lock)
{
    spin_release(&lock->dep_map, 1, _RET_IP_);
    _raw_spin_unlock(lock);
    preempt_enable();
}
 
void __lockfunc _spin_unlock_bh(spinlock_t *lock)
{
    spin_release(&lock->dep_map, 1, _RET_IP_);
    _raw_spin_unlock(lock);
    preempt_enable_no_resched();
    local_bh_enable_ip((unsigned long)__builtin_return_address(0));
}
 
void __lockfunc _spin_unlock_irq(spinlock_t *lock)
{
    spin_release(&lock->dep_map, 1, _RET_IP_);
    _raw_spin_unlock(lock);
    local_irq_enable();
    preempt_enable();
}
 
void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
{
    spin_release(&lock->dep_map, 1, _RET_IP_);
    _raw_spin_unlock(lock);
    local_irq_restore(flags);
    preempt_enable();
}
 

In include/linux/lockdep.h:

#define LOCK_CONTENDED(_lock, try, lock) \
    lock(_lock)
 

In linux/spinlock.h:

# define _raw_spin_lock(lock)        __raw_spin_lock(&(lock)->raw_lock)
 
In arch/x86/include/asm/spinlock.h:

static __always_inline void __raw_spin_lock(raw_spinlock_t *lock)
{
    __ticket_spin_lock(lock);
}
 
#if (NR_CPUS < 256)
#define TICKET_SHIFT 8

static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock)
{
    short inc = 0x0100;

    asm volatile (
        LOCK_PREFIX "xaddw %w0, %1\n"
        "1:\t"
        "cmpb %h0, %b0\n\t"
        "je 2f\n\t"
        "rep ; nop\n\t"
        "movb %1, %b0\n\t"
        /* don't need lfence here, because loads are in-order */
        "jmp 1b\n"
        "2:"
        : "+Q" (inc), "+m" (lock->slock)
        :
        : "memory", "cc");
}

static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock)
{
    int tmp, new;

    asm volatile("movzwl %2, %0\n\t"
             "cmpb %h0,%b0\n\t"
             "leal 0x100(%" REG_PTR_MODE "0), %1\n\t"
             "jne 1f\n\t"
             LOCK_PREFIX "cmpxchgw %w1,%2\n\t"
             "1:"
             "sete %b1\n\t"
             "movzbl %b1,%0\n\t"
             : "=&a" (tmp), "=&q" (new), "+m" (lock->slock)
             :
             : "memory", "cc");

    return tmp;
}

static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock)
{
    asm volatile(UNLOCK_LOCK_PREFIX "incb %0"
             : "+m" (lock->slock)
             :
             : "memory", "cc");
}
#else
#define TICKET_SHIFT 16

static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock)
{
    int inc = 0x00010000;
    int tmp;

    asm volatile(LOCK_PREFIX "xaddl %0, %1\n"
             "movzwl %w0, %2\n\t"
             "shrl $16, %0\n\t"
             "1:\t"
             "cmpl %0, %2\n\t"
             "je 2f\n\t"
             "rep ; nop\n\t"
             "movzwl %1, %2\n\t"
             /* don't need lfence here, because loads are in-order */
             "jmp 1b\n"
             "2:"
             : "+r" (inc), "+m" (lock->slock), "=&r" (tmp)
             :
             : "memory", "cc");
}

static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock)
{
    int tmp;
    int new;

    asm volatile("movl %2,%0\n\t"
             "movl %0,%1\n\t"
             "roll $16, %0\n\t"
             "cmpl %0,%1\n\t"
             "leal 0x00010000(%" REG_PTR_MODE "0), %1\n\t"
             "jne 1f\n\t"
             LOCK_PREFIX "cmpxchgl %1,%2\n\t"
             "1:"
             "sete %b1\n\t"
             "movzbl %b1,%0\n\t"
             : "=&a" (tmp), "=&q" (new), "+m" (lock->slock)
             :
             : "memory", "cc");

    return tmp;
}

static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock)
{
    asm volatile(UNLOCK_LOCK_PREFIX "incw %0"
             : "+m" (lock->slock)
             :
             : "memory", "cc");
}
#endif
 

Kernel-Space Memory Allocation and Memory Mapping in Linux

In memory management there are two kinds of resources, and both have to be obtained:

1. Physical memory
2. Virtual addresses

Allocation functions such as kmalloc and __get_free_page allocate physical memory from low memory and directly return the corresponding kernel logical address (all of low memory is mapped one-to-one into the kernel logical address range, 3G~4G).

vmalloc, by contrast, can allocate physical memory from high memory. Because high memory has no corresponding kernel logical address, vmalloc must additionally allocate a range of kernel virtual addresses, modify the page tables to map those virtual addresses onto the physical pages, and finally return the kernel virtual address.
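
A minimal sketch contrasting the two allocators just described (the sizes are arbitrary):

 
#include <linux/slab.h>
#include <linux/vmalloc.h>

static void alloc_demo(void)
{
    void *k = kmalloc(4096, GFP_KERNEL);  /* low memory, physically contiguous;
                                             the returned address is phys + PAGE_OFFSET */
    void *v = vmalloc(1 << 20);           /* pages may come from high memory; only virtually
                                             contiguous, and the page tables are updated */

    kfree(k);
    vfree(v);
}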

Suppose physical memory is tracked by an array of struct page structures, one per physical page frame, and that this array starts at address START_ADDR.

struct page *mem_map = (struct page *)START_ADDR;

If a pointer struct page *page points to some element of this array, then page - mem_map gives the index of that element. The index corresponds to a physical page frame, from which the physical address can be derived.
In other words, given the address of a struct page, you can find the address of the physical page it represents.
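
In code, the arithmetic just described is roughly the following; the real kernel wraps it in helpers such as page_to_pfn() and page_to_phys():

 
unsigned long pfn  = page - mem_map;        /* array index = page frame number */
unsigned long phys = pfn << PAGE_SHIFT;     /* physical address of that page frame */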

kmap() is mainly used to map high-memory page frames into the kernel. It is typically used like this:

  • Use alloc_pages() to obtain a struct page from the high memory zone, then call kmap(struct page *) to establish a permanent mapping in the kernel address space above PAGE_OFFSET + 896M (between PKMAP_BASE and FIXADDR_START). If the page corresponds to low physical memory, the function simply returns the page's existing virtual address. (A short usage sketch follows this list.)
  • kmap() may sleep, so it must not be used in interrupt context or while holding a lock.
  • kmap() maps only one physical page at a time, so use it sparingly.
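
The alloc_pages()/kmap() pattern from the first bullet, as a rough sketch with error handling trimmed:

 
#include <linux/gfp.h>
#include <linux/highmem.h>
#include <linux/string.h>

static void highmem_demo(void)
{
    struct page *page = alloc_pages(GFP_HIGHUSER, 0);  /* one page, possibly from high memory */
    void *vaddr;

    if (!page)
        return;

    vaddr = kmap(page);              /* map it into the kernel's PKMAP area; may sleep */
    memset(vaddr, 0, PAGE_SIZE);     /* use the page through its kernel virtual address */
    kunmap(page);                    /* drop the mapping */
    __free_pages(page, 0);
}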

Why kmap is needed:

  • High physical memory (beyond 896M) has no fixed one-to-one mapping into the kernel address space (that is, no relation of the form virtual address = physical address + PAGE_OFFSET), so page allocators such as get_free_pages() cannot be used for it. Instead you must use buddy-system interfaces such as alloc_pages() to obtain a struct page *, and then map it into the kernel address space yourself; note that the resulting mapped address no longer differs from the physical address by exactly PAGE_OFFSET.