Linux用户空间线程管理介绍之二：创建线程堆栈

来源：互联网 | 发布：工业以太网知乎 | 编辑：程序博客网 | 时间：2024/06/06 00:36

转自：http://www.longene.org/forum/viewtopic.php?f=17&t=429&sid=babec6ba82dd65e29c5fafe03e4d89c0

前面已经介绍过了线程结构pthread，下面就需要来看看在创建线程过程中，是如何生成这个结构的。allocate_stack函数位于nptl/allocatestack.c中：

代码: 全选: 308 static int 309 allocate_stack (const struct pthread_attr *attr, struct pthread **pdp, 310 ALLOCATE_STACK_PARMS) 311 { 312 struct pthread *pd; 313 size_t size; 314 size_t pagesize_m1 = __getpagesize () - 1; 315 void *stacktop; 316 317 assert (attr != NULL); 318 assert (powerof2 (pagesize_m1 + 1)); 319 assert (TCB_ALIGNMENT >= STACK_ALIGN); 320 321 /* Get the stack size from the attribute if it is set. Otherwise we 322 use the default we determined at start time. */ 323 size = attr->stacksize ?: __default_stacksize; 324 325 /* Get memory for the stack. */ 326 if (__builtin_expect (attr->flags & ATTR_FLAG_STACKADDR, 0)) 327 { ......... 410 } 411 else 412 {

用户程序在调用pthread_create时，可以传进一个参数pthread_attr，这个参数可以指定堆栈地址、大小等属性，323行的意思就是说在指定堆栈大小的情况下，采用指定大小，否则采用默认大小。__default_stacksize可以用ulimit -s查看，在一般系统中，这个值为8M。在通常情况下，应用程序是不指定堆栈大小的。

从326行开始，根据堆栈地址是否由pthread_attr指定，分两种情况处理。在通常情况下，这个地址也是不指定的，因此，直接看412行开始的else部分：

代码: 全选: 412 { 413 /* Allocate some anonymous memory. If possible use the cache. */ 414 size_t guardsize; 415 size_t reqsize; 416 void *mem; 417 const int prot = (PROT_READ | PROT_WRITE 418 | ((GL(dl_stack_flags) & PF_X) ? PROT_EXEC : 0)); 419 420 #if COLORING_INCREMENT != 0 421 /* Add one more page for stack coloring. Don't do it for stacks 422 with 16 times pagesize or larger. This might just cause 423 unnecessary misalignment. */ 424 if (size <= 16 * pagesize_m1) 425 size += pagesize_m1 + 1; 426 #endif 427 428 /* Adjust the stack size for alignment. */ 429 size &= ~__static_tls_align_m1; 430 assert (size != 0); 431 432 /* Make sure the size of the stack is enough for the guard and 433 eventually the thread descriptor. */ 434 guardsize = (attr->guardsize + pagesize_m1) & ~pagesize_m1; 435 if (__builtin_expect (size < ((guardsize + __static_tls_size 436 + MINIMAL_REST_STACK + pagesize_m1) 437 & ~pagesize_m1), 438 0)) 439 /* The stack is too small (or the guard too large). */ 440 return EINVAL; 441 442 /* Try to get a stack from the cache. */ 443 reqsize = size; 444 pd = get_cached_stack (&size, &mem); 445 if (pd == NULL) 446 { 447 /* To avoid aliasing effects on a larger scale than pages we 448 adjust the allocated stack size if necessary. This way 449 allocations directly following each other will not have 450 aliasing problems. */ 451 #if MULTI_PAGE_ALIASING != 0 452 if ((size % MULTI_PAGE_ALIASING) == 0) 453 size += pagesize_m1 + 1; 454 #endif 455 456 mem = mmap (NULL, size, prot, 457 MAP_PRIVATE | MAP_ANONYMOUS | ARCH_MAP_FLAGS, -1, 0); 458 459 if (__builtin_expect (mem == MAP_FAILED, 0)) 460 { 461 #ifdef ARCH_RETRY_MMAP 462 mem = ARCH_RETRY_MMAP (size); 463 if (__builtin_expect (mem == MAP_FAILED, 0)) 464 #endif 465 return errno; 466 } 467 468 /* SIZE is guaranteed to be greater than zero. 469 So we can never get a null pointer back from mmap. */ 470 assert (mem != NULL); 471 472 #if COLORING_INCREMENT != 0 473 /* Atomically increment NCREATED. 
*/ 474 unsigned int ncreated = atomic_increment_val (&nptl_ncreated); 475 476 /* We chose the offset for coloring by incrementing it for 477 every new thread by a fixed amount. The offset used 478 module the page size. Even if coloring would be better 479 relative to higher alignment values it makes no sense to 480 do it since the mmap() interface does not allow us to 481 specify any alignment for the returned memory block. */ 482 size_t coloring = (ncreated * COLORING_INCREMENT) & pagesize_m1; 483 484 /* Make sure the coloring offsets does not disturb the alignment 485 of the TCB and static TLS block. */ 486 if (__builtin_expect ((coloring & __static_tls_align_m1) != 0, 0)) 487 coloring = (((coloring + __static_tls_align_m1) 488 & ~(__static_tls_align_m1)) 489 & ~pagesize_m1); 490 #else 491 /* Unless specified we do not make any adjustments. */ 492 # define coloring 0 493 #endif

417行是设定堆栈段的权限，在某些情况下，堆栈段内可能存放一些临时的代码，这样就需要有可执行权限，一般情况下，是可读写的权限。
设定完堆栈段的权限后，就开始处理堆栈段的大小，主要是一些堆栈大小、对齐的检查，还有Guard页的检查。
在进行堆栈的映射之前，还需要通过get_cached_stack函数，检查系统中是否存在缓冲着的堆栈，在我们的情景中，我们假定是第一次创建线程，就不存在缓冲的堆栈，这个函数留待后面介绍。
这些检查都完成后，就需要通过mmap来映射堆栈了。这是一个匿名映射，相当于在用户空间直接分配内存，有点像brk系统调用，用来分配大块的内存。
后面一段是对coloring的设定，暂时认为这段代码没有起作用吧。

这些完成后，就开始pthread结构的设定了：

代码: 全选: 494 495 /* Place the thread descriptor at the end of the stack. */ 496 #if TLS_TCB_AT_TP 497 pd = (struct pthread *) ((char *) mem + size - coloring) - 1; 498 #elif TLS_DTV_AT_TP 499 pd = (struct pthread *) ((((uintptr_t) mem + size - coloring 500 - __static_tls_size) 501 & ~__static_tls_align_m1) 502 - TLS_PRE_TCB_SIZE); 503 #endif 504 505 /* Remember the stack-related values. */ 506 pd->stackblock = mem; 507 pd->stackblock_size = size; 508 509 /* We allocated the first block thread-specific data array. 510 This address will not change for the lifetime of this 511 descriptor. */ 512 pd->specific[0] = pd->specific_1stblock; 513 514 /* This is at least the second thread. */ 515 pd->header.multiple_threads = 1; 516 #ifndef TLS_MULTIPLE_THREADS_IN_TCB 517 __pthread_multiple_threads = *__libc_multiple_threads_ptr = 1; 518 #endif 519 520 #ifndef __ASSUME_PRIVATE_FUTEX 521 /* The thread must know when private futexes are supported. */ 522 pd->header.private_futex = THREAD_GETMEM (THREAD_SELF, 523 header.private_futex); 524 #endif 525 526 #ifdef NEED_DL_SYSINFO 527 /* Copy the sysinfo value from the parent. */ 528 THREAD_SYSINFO(pd) = THREAD_SELF_SYSINFO; 529 #endif 530 531 /* The process ID is also the same as that of the caller. */ 532 pd->pid = THREAD_GETMEM (THREAD_SELF, pid); 533 534 /* Allocate the DTV for this thread. */ 535 if (_dl_allocate_tls (TLS_TPADJ (pd)) == NULL) 536 { 537 /* Something went wrong. */ 538 assert (errno == ENOMEM); 539 540 /* Free the stack memory we just allocated. */ 541 (void) munmap (mem, size); 542 543 return EAGAIN; 544 } 545 546 547 /* Prepare to modify global data. */ 548 lll_lock (stack_cache_lock, LLL_PRIVATE); 549 550 /* And add to the list of stacks in use. */ 551 list_add (&pd->list, &stack_used); 552 553 lll_unlock (stack_cache_lock, LLL_PRIVATE); 554 555 556 /* There might have been a race. Another thread might have 557 caused the stacks to get exec permission while this new 558 stack was prepared. 
Detect if this was possible and 559 change the permission if necessary. */ 560 if (__builtin_expect ((GL(dl_stack_flags) & PF_X) != 0 561 && (prot & PROT_EXEC) == 0, 0)) 562 { 563 int err = change_stack_perm (pd 564 #ifdef NEED_SEPARATE_REGISTER_STACK 565 , ~pagesize_m1 566 #endif 567 ); 568 if (err != 0) 569 { 570 /* Free the stack memory we just allocated. */ 571 (void) munmap (mem, size); 572 573 return err; 574 } 575 } 576 577 578 /* Note that all of the stack and the thread descriptor is 579 zeroed. This means we do not have to initialize fields 580 with initial value zero. This is specifically true for 581 the 'tid' field which is always set back to zero once the 582 stack is not used anymore and for the 'guardsize' field 583 which will be read next. */ 584 }

前面说到过，在我所观察的系统中，TLS_TCB_AT_TP总是被定义，这意味着pthread位于刚才申请的堆栈的顶端，见497行，这里的-1，是减去一个pthread结构的大小。
505~532行，开始设置新线程的pthread结构，堆栈信息设置为刚刚申请的堆栈，并将pthread结构设置成为多线程状态，futex、sysinfo、pid等则从父线程继承。

接下来是调用_dl_allocate_tls来设置TLS，这是一个相当重要的过程，如果不能设置TLS，程序很有可能无法运行，目前兼容内核中多线程问题很多时候与此相关。
先看宏TLS_TPADJ，它就定义在nptl/allocatestack.c中
#define TLS_TPADJ(pd) (pd)
也就是pthread结构本身。再看_dl_allocate_tls()，位于elf/dl-tls.c中：

代码: 全选: 459 void * 460 internal_function 461 _dl_allocate_tls (void *mem) 462 { 463 return _dl_allocate_tls_init (mem == NULL 464 ? _dl_allocate_tls_storage () 465 : allocate_dtv (mem)); 466 } 在我们这个情景中，传进去的mem值为pthread结构地址，不为NULL，因此调用到了allocate_dtv函数，也是位于elf/dl-tls.c中： 289 static void * 290 internal_function 291 allocate_dtv (void *result) 292 { 293 dtv_t *dtv; 294 size_t dtv_length; 295 296 /* We allocate a few more elements in the dtv than are needed for the 297 initial set of modules. This should avoid in most cases expansions 298 of the dtv. */ 299 dtv_length = GL(dl_tls_max_dtv_idx) + DTV_SURPLUS; 300 dtv = calloc (dtv_length + 2, sizeof (dtv_t)); 301 if (dtv != NULL) 302 { 303 /* This is the initial length of the dtv. */ 304 dtv[0].counter = dtv_length; 305 306 /* The rest of the dtv (including the generation counter) is 307 Initialize with zero to indicate nothing there. */ 308 309 /* Add the dtv to the thread data structures. */ 310 INSTALL_DTV (result, dtv); 311 } 312 else 313 result = NULL; 314 315 return result; 316 }

这个函数比较简单，就是申请一个dtv的数组，然后装载到pthread结构中，有趣的是INSTALL_DTV这个宏，定义在nptl/sysdeps/i386/tls.h中：

代码: 全选: # define INSTALL_DTV(descr, dtvp) \ ((tcbhead_t *) (descr))->dtv = (dtvp) + 1

它没有直接把申请到的dtv数组的首地址装载到pthread结构中，而是将其第二个元素的地址装入，或许它的意思是，第一个元素是存放dtv_length的，没有必要让使用者看到吧。
Allocate_dtv完成后，需要调用_dl_allocate_tls_init对TLS进行初始化：

代码: 全选: 377 void * 378 internal_function 379 _dl_allocate_tls_init (void *result) 380 { 381 if (result == NULL) 382 /* The memory allocation failed. */ 383 return NULL; 384 385 dtv_t *dtv = GET_DTV (result); 386 struct dtv_slotinfo_list *listp; 387 size_t total = 0; 388 size_t maxgen = 0; 389 390 /* We have to prepare the dtv for all currently loaded modules using 391 TLS. For those which are dynamically loaded we add the values 392 indicating deferred allocation. */ 393 listp = GL(dl_tls_dtv_slotinfo_list); 394 while (1) 395 { 396 size_t cnt; 397 398 for (cnt = total == 0 ? 1 : 0; cnt < listp->len; ++cnt) 399 { 400 struct link_map *map; 401 void *dest; 402 403 /* Check for the total number of used slots. */ 404 if (total + cnt > GL(dl_tls_max_dtv_idx)) 405 break; 406 407 map = listp->slotinfo[cnt].map; 408 if (map == NULL) 409 /* Unused entry. */ 410 continue; 411 412 /* Keep track of the maximum generation number. This might 413 not be the generation counter. */ 414 maxgen = MAX (maxgen, listp->slotinfo[cnt].gen); 415 416 if (map->l_tls_offset == NO_TLS_OFFSET) 417 { 418 /* For dynamically loaded modules we simply store 419 the value indicating deferred allocation. */ 420 dtv[map->l_tls_modid].pointer.val = TLS_DTV_UNALLOCATED; 421 dtv[map->l_tls_modid].pointer.is_static = false; 422 continue; 423 } 424 425 assert (map->l_tls_modid == cnt); 426 assert (map->l_tls_blocksize >= map->l_tls_initimage_size); 427 #if TLS_TCB_AT_TP 428 assert ((size_t) map->l_tls_offset >= map->l_tls_blocksize); 429 dest = (char *) result - map->l_tls_offset; 430 #elif TLS_DTV_AT_TP 431 dest = (char *) result + map->l_tls_offset; 432 #else 433 # error "Either TLS_TCB_AT_TP or TLS_DTV_AT_TP must be defined" 434 #endif 435 436 /* Copy the initialization image and clear the BSS part. 
*/ 437 dtv[map->l_tls_modid].pointer.val = dest; 438 dtv[map->l_tls_modid].pointer.is_static = true; 439 memset (__mempcpy (dest, map->l_tls_initimage, 440 map->l_tls_initimage_size), '\0', 441 map->l_tls_blocksize - map->l_tls_initimage_size); 442 } 443 444 total += cnt; 445 if (total >= GL(dl_tls_max_dtv_idx)) 446 break; 447 448 listp = listp->next; 449 assert (listp != NULL); 450 } 451 452 /* The DTV version is up-to-date now. */ 453 dtv[0].counter = maxgen; 454 455 return result; 456 }

这里是一大堆和链接有关的代码，这里就不做解释了，如果以后有时间，或许可以多看看链接相关的代码，梳理一下，链接过程到底是如何完成的。
回到allocate_stack函数中，551行是将此结构链接到stack_used队列中。当线程退出时，将调用到__deallocate_stack，此时，此结构将从stack_used队列脱出，加入到stack_cache中，等待下一个pthread_create调用。
接下来一段和可执行堆栈相关，不是我们所关心的，忽略。

代码: 全选: 585 586 /* Create or resize the guard area if necessary. */ 587 if (__builtin_expect (guardsize > pd->guardsize, 0)) 588 { 589 #ifdef NEED_SEPARATE_REGISTER_STACK 590 char *guard = mem + (((size - guardsize) / 2) & ~pagesize_m1); 591 #elif _STACK_GROWS_DOWN 592 char *guard = mem; 593 # elif _STACK_GROWS_UP 594 char *guard = (char *) (((uintptr_t) pd - guardsize) & ~pagesize_m1); 595 #endif 596 if (mprotect (guard, guardsize, PROT_NONE) != 0) 597 { 598 int err; 599 mprot_error: 600 err = errno; 601 602 lll_lock (stack_cache_lock, LLL_PRIVATE); 603 604 /* Remove the thread from the list. */ 605 list_del (&pd->list); 606 607 lll_unlock (stack_cache_lock, LLL_PRIVATE); 608 609 /* Get rid of the TLS block we allocated. */ 610 _dl_deallocate_tls (TLS_TPADJ (pd), false); 611 612 /* Free the stack memory regardless of whether the size 613 of the cache is over the limit or not. If this piece 614 of memory caused problems we better do not use it 615 anymore. Uh, and we ignore possible errors. There 616 is nothing we could do. */ 617 (void) munmap (mem, size); 618 619 return err; 620 } 621 622 pd->guardsize = guardsize; 623 } 624 else if (__builtin_expect (pd->guardsize - guardsize > size - reqsize, 625 0)) 626 { 627 /* The old guard area is too large. 
*/ 628 629 #ifdef NEED_SEPARATE_REGISTER_STACK 630 char *guard = mem + (((size - guardsize) / 2) & ~pagesize_m1); 631 char *oldguard = mem + (((size - pd->guardsize) / 2) & ~pagesize_m1); 632 633 if (oldguard < guard 634 && mprotect (oldguard, guard - oldguard, prot) != 0) 635 goto mprot_error; 636 637 if (mprotect (guard + guardsize, 638 oldguard + pd->guardsize - guard - guardsize, 639 prot) != 0) 640 goto mprot_error; 641 #elif _STACK_GROWS_DOWN 642 if (mprotect ((char *) mem + guardsize, pd->guardsize - guardsize, 643 prot) != 0) 644 goto mprot_error; 645 #elif _STACK_GROWS_UP 646 if (mprotect ((char *) pd - pd->guardsize, 647 pd->guardsize - guardsize, prot) != 0) 648 goto mprot_error; 649 #endif 650 651 pd->guardsize = guardsize; 652 }

上面这么一大段，是为了设置Guard页，总体说来就是把刚才申请到的内存最低几页，设置成为PROT_NONE，使这几页无法访问。
再下面就是锁、mutex等一些同步用的字段设置。这样在新线程创建出来之前，pthread结构的设置工作就基本完成了。

0 0