Learning About the Thundering Herd Effect
The thundering herd effect:
A simple analogy: when you throw a piece of food into a flock of pigeons, only one pigeon ends up grabbing it, yet all of them are startled and rush over to compete; the ones that fail simply go back to sleep and wait for the next piece. Every piece of food thrown therefore wakes the whole flock. That is the thundering herd.
For an operating system, something similar happens when multiple processes or threads wait on the same resource: every time the resource becomes available, all of them wake up and compete for it.
Problems caused by the thundering herd:
1. Excessive context switching makes the CPU behave like a porter, constantly shuttling between registers and the run queue; more time is spent switching processes (threads) than doing real work in them. The direct cost is saving and reloading CPU registers (for example the program counter) and executing the scheduler code; the indirect cost comes from the data shared between the caches of multiple cores.
2. One way to avoid the thundering herd is a lock, so that at any moment only one process (thread) handles the pending event. The lock itself, however, also consumes CPU and other resources and costs performance.
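As a rough illustration of the locking idea (a minimal sketch only, loosely inspired by nginx's accept_mutex; the lock file path /tmp/accept.lock, the port and the worker count are made-up values, not anything from the original text), each forked worker takes a file lock before calling accept, so only one worker waits for a connection at any moment:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/file.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <sys/wait.h>

#define WORKERS 4                         /* arbitrary worker count for the sketch */

int main(void)
{
    /* Listening socket shared by every worker through fork(). */
    int listen_fd = socket(AF_INET, SOCK_STREAM, 0);
    struct sockaddr_in addr;

    memset(&addr, 0, sizeof(addr));
    addr.sin_family = AF_INET;
    addr.sin_port = htons(1234);          /* arbitrary test port */
    addr.sin_addr.s_addr = htonl(INADDR_ANY);
    bind(listen_fd, (struct sockaddr *)&addr, sizeof(addr));
    listen(listen_fd, SOMAXCONN);

    for (int i = 0; i < WORKERS; ++i) {
        if (fork() == 0) {
            /* Each worker opens the lock file itself: flock() locks belong to
             * the open file description, so sharing one inherited fd would not
             * serialize the workers. */
            int lock_fd = open("/tmp/accept.lock", O_CREAT | O_RDWR, 0600);
            for (;;) {
                flock(lock_fd, LOCK_EX);  /* only the lock holder sits in accept */
                int conn = accept(listen_fd, NULL, NULL);
                flock(lock_fd, LOCK_UN);
                if (conn < 0)
                    continue;
                printf("worker %d handled a connection\n", getpid());
                close(conn);
            }
        }
    }
    while (wait(NULL) > 0)
        ;
    return 0;
}

The lock removes the herd (no worker is woken needlessly), but, as noted above, acquiring and releasing it for every connection has its own CPU and latency cost.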
1) The accept thundering herd
The main process creates the socket, binds and listens, then forks several child processes, each of which loops calling accept on the same listen fd.
Every process blocks in accept; when a new connection arrives, all of them are woken, but only one accept succeeds while the rest fail and go back to sleep.
The program below simulates the scenario. When we connect to this server with telnet, only one process pid is printed, i.e. only one process is woken:
1 #include<stdio.h>
2 #include<sys/types.h>
3 #include<sys/socket.h>
4 #include<unistd.h>
5 #include<sys/epoll.h>
6 #include<netdb.h>
7 #include<stdlib.h>
8 #include<fcntl.h>
9 #include<sys/wait.h>
10 #include<errno.h>
11#define PROCESS_NUM 10
12#define MAXEVENTS 64
13//socket创建和绑定
14int sock_creat_bind(char * port){
15int sock_fd = socket(AF_INET, SOCK_STREAM, 0);
16struct sockaddr_in serveraddr;
17 serveraddr.sin_family = AF_INET;
18 serveraddr.sin_port = htons(atoi(port));
19 serveraddr.sin_addr.s_addr = htonl(INADDR_ANY);
20
21 bind(sock_fd, (struct sockaddr *)&serveraddr, sizeof(serveraddr));
22return sock_fd;
23 }
24//利⽤fcntl设置⽂件或者函数调⽤的状态标志
25int make_nonblocking(int fd){
26int val = fcntl(fd, F_GETFL);
27 val |= O_NONBLOCK;
28if(fcntl(fd, F_SETFL, val) < 0){
29 perror("fcntl set");
30return -1;
31 }
32return0;
33 }
34
35int main(int argc, char *argv[])
36 {
37int sock_fd, epoll_fd;
38struct epoll_event event;
39struct epoll_event *events;
40
41if(argc < 2){
42 printf("usage: [port] %s", argv[1]);
43 exit(1);
44 }
45if((sock_fd = sock_creat_bind(argv[1])) < 0){
46 perror("socket and bind");
47 exit(1);
48 }
49if(make_nonblocking(sock_fd) < 0){
50 perror("make non blocking");
50 perror("make non blocking");
51 exit(1);
52 }
53if(listen(sock_fd, SOMAXCONN) < 0){
54 perror("listen");
55 exit(1);
56 }
57if((epoll_fd = epoll_create(MAXEVENTS))< 0){
58 perror("epoll_create");
59 exit(1);
60 }
61event.data.fd = sock_fd;
62event.events = EPOLLIN;
63if(epoll_ctl(epoll_fd, EPOLL_CTL_ADD, sock_fd, &event) < 0){
64 perror("epoll_ctl");
65 exit(1);
66 }
67/*buffer where events are returned*/
68 events = calloc(MAXEVENTS, sizeof(event));
69int i;
70for(i = 0; i < PROCESS_NUM; ++i){
71int pid = fork();
72if(pid == 0){
73while(1){
74int num, j;
75 num = epoll_wait(epoll_fd, events, MAXEVENTS, -1);
76 printf("process %d returnt from epoll_wait\n", getpid());
77 sleep(2);
78for(i = 0; i < num; ++i){
79if((events[i].events & EPOLLERR) || (events[i].events & EPOLLHUP) || (!(events[i].events & EPOLLIN))){
80 fprintf(stderr, "epoll error\n");
81 close(events[i].data.fd);
82continue;
83 }else if(sock_fd == events[i].data.fd){
84//收到关于监听套接字的通知,意味着⼀盒或者多个传⼊连接
85struct sockaddr in_addr;
86 socklen_t in_len = sizeof(in_addr);
87if(accept(sock_fd, &in_addr, &in_len) < 0){
88 printf("process %d accept failed!\n", getpid());
89 }else{
90 printf("process %d accept successful!\n", getpid());
91 }
92 }
93 }
94 }
95 }
96 }
97 wait(0);
98free(events);
99 close(sock_fd);
100return0;
101 }
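Note that the strace output below traces a simpler test binary (./fork) whose source is not shown in the original text: it skips epoll entirely and lets every child block directly in accept(). A minimal sketch of what such a program might look like (the port 1234 matches the bind() seen in the trace; everything else is an assumption):

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <sys/wait.h>

#define PROCESS_NUM 10

int main(void)
{
    int listen_fd = socket(AF_INET, SOCK_STREAM, 0);
    struct sockaddr_in addr;

    memset(&addr, 0, sizeof(addr));
    addr.sin_family = AF_INET;
    addr.sin_port = htons(1234);          /* same port as in the trace below */
    addr.sin_addr.s_addr = htonl(INADDR_ANY);
    bind(listen_fd, (struct sockaddr *)&addr, sizeof(addr));
    listen(listen_fd, 1024);

    for (int i = 0; i < PROCESS_NUM; ++i) {
        if (fork() == 0) {
            for (;;) {
                /* Every child blocks here; on a modern kernel only one of
                 * them is woken per incoming connection. */
                int conn = accept(listen_fd, NULL, NULL);
                if (conn < 0) {
                    printf("process %d accept a connection failed\n", getpid());
                    continue;
                }
                printf("process %d accept a connection\n", getpid());
                close(conn);
            }
        }
    }
    wait(NULL);
    return 0;
}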
fly@G480:~/fly/learn/test$ strace -f ./fork
execve("./fork", ["./fork"], 0x7fffd0d489d8 /* 61 vars */) = 0
brk(NULL) = 0x55e33c728000
arch_prctl(0x3001 /* ARCH_ */, 0x7fff1f212060) = -1 EINVAL (Invalid argument)
access("/etc/ld.so.preload", R_OK) = -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=102598, ...}) = 0
mmap(NULL, 102598, PROT_READ, MAP_PRIVATE, 3, 0) = 0x7f29c9075000
close(3) = 0
openat(AT_FDCWD, "/lib/x86_64-linux-gnu/libc.so.6", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\3\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\360r\2\0\0\0\0\0"..., 832) = 832
lseek(3, 64, SEEK_SET) = 64
read(3, "\6\0\0\0\4\0\0\0@\0\0\0\0\0\0\0@\0\0\0\0\0\0\0@\0\0\0\0\0\0\0"..., 784) = 784
lseek(3, 848, SEEK_SET) = 848
read(3, "\4\0\0\0\20\0\0\0\5\0\0\0GNU\0\2\0\0\300\4\0\0\0\3\0\0\0\0\0\0\0", 32) = 32
lseek(3, 880, SEEK_SET) = 880
read(3, "\4\0\0\0\24\0\0\0\3\0\0\0GNU\0u\343\342\331Yj\256%\0230\256~\363\371\32\204"..., 68) = 68
fstat(3, {st_mode=S_IFREG|0755, st_size=2025032, ...}) = 0
mmap(NULL, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f29c9073000
lseek(3, 64, SEEK_SET) = 64
read(3, "\6\0\0\0\4\0\0\0@\0\0\0\0\0\0\0@\0\0\0\0\0\0\0@\0\0\0\0\0\0\0"..., 784) = 784
lseek(3, 848, SEEK_SET) = 848
read(3, "\4\0\0\0\20\0\0\0\5\0\0\0GNU\0\2\0\0\300\4\0\0\0\3\0\0\0\0\0\0\0", 32) = 32
lseek(3, 880, SEEK_SET) = 880
read(3, "\4\0\0\0\24\0\0\0\3\0\0\0GNU\0u\343\342\331Yj\256%\0230\256~\363\371\32\204"..., 68) = 68
mmap(NULL, 2032984, PROT_READ, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f29c8e82000
mmap(0x7f29c8ea7000, 1540096, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x25000) = 0x7f29c8ea7000
mmap(0x7f29c901f000, 303104, PROT_READ, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x19d000) = 0x7f29c901f000
mmap(0x7f29c9069000, 24576, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x1e6000) = 0x7f29c9069000
mmap(0x7f29c906f000, 13656, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7f29c906f000
close(3) = 0
arch_prctl(ARCH_SET_FS, 0x7f29c9074540) = 0
mprotect(0x7f29c9069000, 12288, PROT_READ) = 0
mprotect(0x55e33bbdf000, 4096, PROT_READ) = 0
mprotect(0x7f29c90bb000, 4096, PROT_READ) = 0
munmap(0x7f29c9075000, 102598) = 0
socket(AF_INET, SOCK_STREAM, IPPROTO_IP) = 3
bind(3, {sa_family=AF_INET, sin_port=htons(1234), sin_addr=inet_addr("0.0.0.0")}, 16) = 0
listen(3, 1024) = 0
clone(child_stack=NULL, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0x7f29c9074810) = 5528 strace: Process 5528 attached
[pid 5527] clone( <unfinished ...>
[pid 5528] accept(3, NULL, NULLstrace: Process 5529 attached
<unfinished ...>
[pid 5527] <... clone resumed> child_stack=NULL, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD,
child_tidptr=0x7f29c9074810) = 5529
[pid 5527] clone( <unfinished ...>
[pid 5529] accept(3, NULL, NULLstrace: Process 5530 attached
<unfinished ...>
[pid 5527] <... clone resumed> child_stack=NULL, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD,
child_tidptr=0x7f29c9074810) = 5530
[pid 5527] clone( <unfinished ...>
[pid 5530] accept(3, NULL, NULLstrace: Process 5531 attached
<unfinished ...>
[pid 5527] <... clone resumed> child_stack=NULL, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD,
child_tidptr=0x7f29c9074810) = 5531
[pid 5527] clone( <unfinished ...>
[pid 5531] accept(3, NULL, NULLstrace: Process 5532 attached
<unfinished ...>
[pid 5527] <... clone resumed> child_stack=NULL, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD,
child_tidptr=0x7f29c9074810) = 5532
[pid 5527] clone( <unfinished ...>
[pid 5532] accept(3, NULL, NULLstrace: Process 5533 attached
<unfinished ...>
[pid 5527] <... clone resumed> child_stack=NULL, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD,
child_tidptr=0x7f29c9074810) = 5533
[pid 5527] clone( <unfinished ...>
[pid 5533] accept(3, NULL, NULLstrace: Process 5534 attached
<unfinished ...>
[pid 5527] <... clone resumed> child_stack=NULL, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD,
child_tidptr=0x7f29c9074810) = 5534
[pid 5527] clone( <unfinished ...>
[pid 5534] accept(3, NULL, NULLstrace: Process 5535 attached
<unfinished ...>
[pid 5527] <... clone resumed> child_stack=NULL, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD,
child_tidptr=0x7f29c9074810) = 5535
[pid 5527] clone( <unfinished ...>
[pid 5535] accept(3, NULL, NULL <unfinished ...>
[pid 5527] <... clone resumed> child_stack=NULL, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD,
child_tidptr=0x7f29c9074810) = 5536
strace: Process 5536 attached
[pid 5527] clone( <unfinished ...>
[pid 5536] accept(3, NULL, NULLstrace: Process 5537 attached
<unfinished ...>
[pid 5527] <... clone resumed> child_stack=NULL, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD,
child_tidptr=0x7f29c9074810) = 5537
[pid 5527] wait4(-1, <unfinished ...>
[pid 5537] accept(3, NULL, NULL
Here we first see that the system creates ten processes.
As the trace shows, all ten of them end up blocked in the accept system call.
Now run telnet 127.0.0.1 1234 in another terminal:
[pid 5528] <... accept resumed> ) = 4
[pid 5528] getpid() = 5528
[pid 5528] fstat(1, {st_mode=S_IFCHR|0620, st_rdev=makedev(0x88, 0x1), ...}) = 0
[pid 5528] brk(NULL) = 0x55e33c728000
[pid 5528] brk(0x55e33c749000) = 0x55e33c749000
[pid 5528] write(1, "process 5528 accept a connection"..., 49process 5528 accept a connection failed: Success
) = 49
[pid 5528] close(4) = 0
[pid 5528] accept(3, NULL, NULL^C <unfinished ...>
[pid 5537] <... accept resumed> ) = ? ERESTARTSYS (To be restarted if SA_RESTART is set)
Clearly only one process succeeds in accept when the telnet connection arrives. You may share my doubt: did the kernel actually wake every process, with the others simply failing to grab the connection, so that the herd is merely "hidden"?
No. Since kernel 2.6, the accept thundering herd has been solved by adding an exclusive-wait flag in the kernel.
An exclusive wait behaves much like an ordinary sleep, with two main differences:
1) A wait-queue entry with the WQ_FLAG_EXCLUSIVE flag set is added to the tail of the wait queue; entries without the flag are added to the head.
2) When wake_up is called on a wait queue, it stops after waking the first process that has WQ_FLAG_EXCLUSIVE set.
With exclusive waiting, for example when multiple threads block in accept on one listening socket descriptor, the kernel wakes only the first waiter in that event's queue; the others keep waiting for the next event. This avoids the thundering herd among threads (or processes) waiting on the same socket descriptor.
Let's now dig into the kernel source to see how this is done. We need to answer the following questions:
1. How accept() is implemented, including how a sock is taken out of the accept queue.
2. How a process blocked in accept() is woken up.
3. How accept() avoids the thundering herd.
4. When several processes call accept(), which one is woken first.
Implementation of accept()
The logic of accept() is fairly simple: if no fully established TCP connection is available, a blocking socket sleeps while a non-blocking one returns -EAGAIN.
In summary, the following cases have to be handled:
1. The accept queue already contains a socket: accept() returns the corresponding fd directly.
2. The accept queue is empty and the socket is blocking: the process simply sleeps.
3. The accept queue is empty and the socket is non-blocking: return -EAGAIN right away (see the short sketch after this list).
4. For a blocking listen fd, the current process must be hung on the wait queue of the socket behind the listen fd, give up the CPU, and wait to be woken.
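As a quick user-space illustration of case 3 (a hypothetical snippet, not from the original text), calling accept on a non-blocking listening socket whose accept queue is empty fails immediately with EAGAIN/EWOULDBLOCK instead of sleeping:

#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>
#include <fcntl.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>

int main(void)
{
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    struct sockaddr_in addr;

    memset(&addr, 0, sizeof(addr));
    addr.sin_family = AF_INET;
    addr.sin_port = htons(0);             /* let the kernel pick a free port */
    addr.sin_addr.s_addr = htonl(INADDR_ANY);
    bind(fd, (struct sockaddr *)&addr, sizeof(addr));
    listen(fd, SOMAXCONN);

    /* Switch the listening socket to non-blocking mode. */
    fcntl(fd, F_SETFL, fcntl(fd, F_GETFL) | O_NONBLOCK);

    /* Nothing is pending, so this returns -1 with errno EAGAIN (case 3)
     * rather than putting the process to sleep (cases 2 and 4). */
    if (accept(fd, NULL, NULL) < 0 && (errno == EAGAIN || errno == EWOULDBLOCK))
        printf("accept would block: %s\n", strerror(errno));

    close(fd);
    return 0;
}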
sys_accept->sys_accept4->inet_accept->inet_csk_accept
其中 inet_csk_accept是核⼼处理逻辑,其处理了上述1、3两种情况
/*
 * This will accept the next outstanding connection.
 */
struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
{
    struct inet_connection_sock *icsk = inet_csk(sk);
    struct request_sock_queue *queue = &icsk->icsk_accept_queue;
    struct sock *newsk;
    struct request_sock *req;
    int error;

    lock_sock(sk);

    /* We need to make sure that this socket is listening,
     * and that it has something pending.
     */

    /* Only a socket in TCP_LISTEN state may call accept. */
    error = -EINVAL;
    if (sk->sk_state != TCP_LISTEN)
        goto out_err;

    /* Find already established connection */

    /* If the accept queue already holds a connection whose three-way
     * handshake has completed, we skip this if-block and take the socket
     * from the queue further down. */
    if (reqsk_queue_empty(queue)) {
        long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);

        /* If this is a non blocking socket don't sleep */
        /* Non-blocking socket: return straight away. */
        error = -EAGAIN;
        if (!timeo)
            goto out_err;

        /* Blocking socket: call inet_csk_wait_for_connect (discussed below). */
        error = inet_csk_wait_for_connect(sk, timeo);

        if (error)
            goto out_err;
    }

    /* Reaching this point means the accept queue is non-empty; take a socket. */
    req = reqsk_queue_remove(queue);
    newsk = req->sk;

    sk_acceptq_removed(sk);
    if (sk->sk_protocol == IPPROTO_TCP && queue->fastopenq != NULL) {
        spin_lock_bh(&queue->fastopenq->lock);
        if (tcp_rsk(req)->listener) {
            /* We are still waiting for the final ACK from 3WHS
             * so can't free req now. Instead, we set req->sk to
             * NULL to signify that the child socket is taken
             * so reqsk_fastopen_remove() will free the req
             * when 3WHS finishes (or is aborted).
             */
            req->sk = NULL;
            req = NULL;
        }
        spin_unlock_bh(&queue->fastopenq->lock);
    }
out:
    release_sock(sk);
    if (req)
        __reqsk_free(req);
    return newsk;
out_err:
    newsk = NULL;
    req = NULL;
    *err = error;
    goto out;
}
inet_csk_wait_for_connect handles cases 2 and 4:
/*
 * Wait for an incoming connection, avoid race conditions. This must be called
 * with the socket locked.
 */
static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
{
    struct inet_connection_sock *icsk = inet_csk(sk);
    DEFINE_WAIT(wait);
    int err;

    /*
     * True wake-one mechanism for incoming connections: only
     * one process gets woken up, not the 'whole herd'.
     * Since we do not 'race & poll' for established sockets
     * anymore, the common case will execute the loop only once.
     *
     * Subtle issue: "add_wait_queue_exclusive()" will be added
     * after any current non-exclusive waiters, and we know that
     * it will always _stay_ after any new non-exclusive waiters
     * because all non-exclusive waiters are added at the
     * beginning of the wait-queue. As such, it's ok to "drop"
     * our exclusiveness temporarily when we get woken up without
     * having to remove and re-insert us on the wait queue.
     */
    for (;;) {
        /* prepare_to_wait_exclusive is important: it hangs `wait` on the
         * wait queue of the current sk. */
        prepare_to_wait_exclusive(sk_sleep(sk), &wait,
                                  TASK_INTERRUPTIBLE);
        release_sock(sk);
        /* icsk_accept_queue is the accept (full) queue. */
        if (reqsk_queue_empty(&icsk->icsk_accept_queue))
            timeo = schedule_timeout(timeo); /* in the blocking case, execution continues only after this process is woken */
        lock_sock(sk);
        err = 0;

        /* If we returned from schedule_timeout without timing out, the
         * accept queue must now be non-empty. */
        if (!reqsk_queue_empty(&icsk->icsk_accept_queue))
            break; /* the path every normal run takes */
        err = -EINVAL;
        if (sk->sk_state != TCP_LISTEN)
            break;
        err = sock_intr_errno(timeo);

        /* Leave the loop on a pending signal or an expired timeout;
         * otherwise go back to sleep. */
        if (signal_pending(current))
            break;
        err = -EAGAIN;
        if (!timeo)
            break;
    }
    finish_wait(sk_sleep(sk), &wait);
    return err;
}
First, why the loop? The reason is historical. Consider the case where the sleep ends before the timeout expires, so schedule_timeout returns a value greater than 0. When can that happen? Either the process received a signal, or the accept queue of the socket behind the listen fd received an entry. Ignoring signals, suppose the queue received an entry: historically, Linux accept did thunder, so once the queue became non-empty every process was woken. Some of them then took a socket from the queue while others found it empty again; the latter clearly had not slept out their timeout, so they had to go back to sleep. The kernel analysed here, however, is 3.10, where only one process is woken when the queue becomes non-empty, so this for loop normally runs only once.
prepare_to_wait_exclusive is important: it adds the current context to the wait queue of the socket behind the listen fd. With multiple processes, that wait queue therefore holds the contexts of several processes.
How multi-process accept avoids the thundering herd
Leaving SO_REUSEPORT aside, multi-process accept only occurs when parent and child processes accept on the same listen fd. As noted above, prepare_to_wait_exclusive adds the current process context to the wait queue of the listen fd's socket, so the contexts of the parent and all the children end up on that wait queue. The core question is how they are woken, since the so-called thundering herd is exactly the case where every process on that wait queue gets woken. Let's look at the wakeup path:
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
    struct sock *rsk;

    ......
    if (sk->sk_state == TCP_LISTEN) {
        struct sock *nsk = tcp_v4_hnd_req(sk, skb);
        if (!nsk)
            goto discard;

        if (nsk != sk) {
            sock_rps_save_rxhash(nsk, skb);
            /* When the client's final ACK of the three-way handshake arrives,
             * we go through tcp_child_process here. */
            if (tcp_child_process(sk, nsk, skb)) {
                rsk = nsk;
                goto reset;
            }
            return 0;
        }
    }
    ......
}
int tcp_child_process(struct sock *parent, struct sock *child,
                      struct sk_buff *skb)
{
    int ret = 0;
    int state = child->sk_state;

    if (!sock_owned_by_user(child)) {
        ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb),
                                    skb->len);
        /* Wakeup parent, send SIGIO */
        if (state == TCP_SYN_RECV && child->sk_state != state)
            parent->sk_data_ready(parent, 0); /* wake the process blocked in accept; this ends up in sock_def_readable */
    } else {
        /* Alas, it is possible again, because we do lookup
         * in main socket hash table and lock on listening
         * socket does not protect us more.
         */
        __sk_add_backlog(child, skb);
    }

    bh_unlock_sock(child);
    sock_put(child);
    return ret;
}
static void sock_def_readable(struct sock *sk, int len)
{
    struct socket_wq *wq;

    rcu_read_lock();
    wq = rcu_dereference(sk->sk_wq);
    /* Since accept joined the queue via prepare_to_wait_exclusive, the
     * wakeup goes through wake_up_interruptible_sync_poll. */
    if (wq_has_sleeper(wq))
        wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
                                        POLLRDNORM | POLLRDBAND);
    sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
    rcu_read_unlock();
}
#define wake_up_interruptible_sync_poll(x, m) \
    __wake_up_sync_key((x), TASK_INTERRUPTIBLE, 1, (void *) (m))
Note that the third argument of __wake_up_sync_key (nr_exclusive) is 1. So when several processes are blocked in accept, the kernel wakes only one waiting process, and the wakeup order is FIFO:
static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
                             int nr_exclusive, int wake_flags, void *key)
{
    wait_queue_t *curr, *next;

    list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
        unsigned flags = curr->flags;

        /* Entries queued by prepare_to_wait_exclusive have WQ_FLAG_EXCLUSIVE
         * set, and nr_exclusive is 1 here, so the loop breaks after a single
         * wakeup. */
        if (curr->func(curr, mode, wake_flags, key) &&
            (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
            break;
    }
}