wget: Non-Recursive Download



1 Overview

The previous post analyzed wget's argument parsing; this one looks at how wget performs a non-recursive download of an HTML page or other file. In essence, wget uses a socket to send an HTTP request (GET or POST) to the web server; the server (nginx, Apache, and so on) parses the request and sends back a response. HTTP's transport layer is TCP, so, put simply, wget sends TCP packets whose payload follows the HTTP protocol, and the server replies in kind. So easy.
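To make that concrete, here is a bare-bones sketch of such an exchange in C. This is an illustration only, not wget code; it hard-codes our example URL and uses HTTP/1.0 so the server closes the connection when the body ends.

#include <netdb.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int
main (void)
{
  struct addrinfo hints, *res;
  memset (&hints, 0, sizeof hints);
  hints.ai_family = AF_UNSPEC;
  hints.ai_socktype = SOCK_STREAM;              /* HTTP rides on TCP */
  if (getaddrinfo ("www.baidu.com", "80", &hints, &res) != 0)
    return 1;

  int fd = socket (res->ai_family, res->ai_socktype, res->ai_protocol);
  if (fd < 0 || connect (fd, res->ai_addr, res->ai_addrlen) != 0)
    return 1;

  /* The request is plain text that follows the HTTP protocol. */
  const char *req = "GET /index.html HTTP/1.0\r\n"
                    "Host: www.baidu.com\r\n"
                    "\r\n";
  write (fd, req, strlen (req));

  /* The reply comes back on the same connection:
     status line, headers, blank line, body. */
  char buf[4096];
  ssize_t n;
  while ((n = read (fd, buf, sizeof buf)) > 0)
    fwrite (buf, 1, n, stdout);

  close (fd);
  freeaddrinfo (res);
  return 0;
}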

The overall flow of wget can be summarized by the following flowchart:


The config_analysis stage was covered in the previous post; this post analyzes what wget does after it.

We consider only a non-recursive download with no extra options, i.e. running:

wget www.baidu.com/index.html

2 Detailed Code Analysis

The previous installment covered argument parsing; after the options are parsed, wget validates them. That code is skipped here.

nurl = argc - optind;  /* number of URLs the user asked to download; since the
                          command is wget www.baidu.com/index.html, nurl == 1 */

// code1:
url = alloca_array (char *, nurl + 1);
for (i = 0; i < nurl; i++, optind++)
  {
    char *rewritten = rewrite_shorthand_url (argv[optind]);  /* prepend http:// if needed */
    if (rewritten)
      url[i] = rewritten;
    else
      url[i] = xstrdup (argv[optind]);
  }

url[i] = NULL;  /* set the final element to NULL as an end marker */

The code above allocates url, a char * array with nurl + 1 elements.

rewrite_shorthand_url (argv[optind]) prepends the scheme when the user omits it: wget supports http, https, and ftp, and defaults to http:// if no scheme is given. The final element of url is set to NULL as the end-of-list marker.
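A minimal sketch of the idea (my simplification, not wget's actual code; the real rewrite_shorthand_url handles further shorthand forms as well):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* If the argument carries no "scheme://" prefix, prepend "http://";
   otherwise return NULL to signal "leave it alone". */
static char *
rewrite_shorthand_url_sketch (const char *arg)
{
  if (strstr (arg, "://"))
    return NULL;                        /* scheme already present */
  char *rewritten = malloc (strlen (arg) + sizeof "http://");
  sprintf (rewritten, "http://%s", arg);
  return rewritten;
}

int
main (void)
{
  char *u = rewrite_shorthand_url_sketch ("www.baidu.com/index.html");
  printf ("%s\n", u ? u : "(unchanged)");  /* http://www.baidu.com/index.html */
  free (u);
  return 0;
}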

// code2 (excerpt):
for (t = url; *t; t++)
  {
    char *filename = NULL, *redirected_URL = NULL;
    int dt, url_err;
    /* Need to do a new struct iri every time, because
       retrieve_url may modify it in some circumstances,
       currently. */
    struct iri *iri = iri_new ();
    struct url *url_parsed;

    set_uri_encoding (iri, opt.locale, true);
    /* parse the URL */
    url_parsed = url_parse (*t, &url_err, iri, true);
    if (!url_parsed)
      {
        char *error = url_error (*t, url_err);
        logprintf (LOG_NOTQUIET, "%s: %s.\n", *t, error);
        xfree (error);
        inform_exit_status (URLERROR);
      }
    else
      {
        /* recursive download, or page requisites (css/js etc.) wanted,
           and not a plain FTP URL */
        if ((opt.recursive || opt.page_requisites)
            && (url_scheme (*t) != SCHEME_FTP || url_uses_proxy (url_parsed)))
          {
            /* this case is the recursive URL download */
            int old_follow_ftp = opt.follow_ftp;

            /* Turn opt.follow_ftp on in case of recursive FTP retrieval */
            if (url_scheme (*t) == SCHEME_FTP)
              opt.follow_ftp = 1;

            retrieve_tree (url_parsed, NULL);
            opt.follow_ftp = old_follow_ftp;
          }
        else
          {
            /* this case is the non-recursive URL download */
            retrieve_url (url_parsed, *t, &filename, &redirected_URL, NULL,
                          &dt, opt.recursive, iri, true);
          }


The loop walks over url and downloads each URL the user supplied.

A normal URL has the form scheme://host[:port][/path][;params][?query][#fragment]. url_parse breaks the URL into these components (set_uri_encoding merely records which character encoding to assume). For our example URL the result is:


url:      http://www.baidu.com/index.html
scheme:   SCHEME_HTTP
host:     www.baidu.com
port:     80
path:     index.html
params:   NULL
query:    NULL
fragment: NULL
file:     index.html
user:     NULL
passwd:   NULL

The URL is also converted to UTF-8 encoding along the way.

The user's options decide between a recursive and a non-recursive download.

Recursive download condition: (-r or -p was given) && (the scheme is not FTP || a proxy is in use)

Since ours is a plain direct download, control falls through to:

retrieve_url (url_parsed, *t, &filename, &redirected_URL, NULL,
              &dt, opt.recursive, iri, true);

2.1 retrieve_url

// If a proxy were in use, some attributes would be set up here; since we use no proxy, this is skipped and retrieve_url moves on to the scheme-specific loop, http_loop for our URL.

2.2 http_loop

result = http_loop (u, orig_parsed, &mynewloc, &local_file, refurl, dt,
                    proxy_url, iri);

Parameters:

u and orig_parsed hold the same value.
mynewloc points to NULL.
local_file points to NULL.
refurl points to NULL.
dt is -1.
proxy_url points to NULL.
iri is the iri built by the caller, including the character encoding.

// code:
hstat.referer = referer;  /* set the referer; NULL at this point */

/* Pick the local file name: --output-document wins; otherwise the
   name is derived from the URL's trailing component. */
if (opt.output_document)
  {
    hstat.local_file = xstrdup (opt.output_document);
    got_name = true;
  }
else if (!opt.content_disposition)
  {
    hstat.local_file = url_file_name (opt.trustservernames ? u : original_url, NULL);
    /* If the file named by u->file already exists, this generates a
       new name such as file.1; with clobbering enabled the existing
       file is overwritten instead. */
    got_name = true;
  }
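The renaming idea can be pictured like this (a hypothetical sketch in the spirit of url_file_name, not the real implementation; it ignores directories, quoting, and --no-clobber):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

/* Return the name itself if it is free, else append .1, .2, ...
   until an unused name is found. */
static char *
unique_name_sketch (const char *file)
{
  if (access (file, F_OK) != 0)
    return strdup (file);               /* name is free, use it as-is */

  size_t len = strlen (file) + 16;
  char *buf = malloc (len);
  for (int i = 1; ; i++)
    {
      snprintf (buf, len, "%s.%d", file, i);
      if (access (buf, F_OK) != 0)      /* first unused suffix wins */
        return buf;
    }
}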

2.3 gethttp

req = request_new ();  /* build a request object */

static struct request *
request_new (void)
{
  struct request *req = xnew0 (struct request);  /* allocate the request struct */
  req->hcapacity = 8;                            /* initial header capacity: 8 */
  req->headers = xnew_array (struct request_header, req->hcapacity);
  return req;
}

The request structure:

struct request {
  const char *method;           /* request method */
  char *arg;                    /* request target */
  /* Each header holds an HTTP key/value pair; for Content-Length: xxx,
     name is "Content-Length" and value is "xxx". */
  struct request_header {
    char *name, *value;
    enum rp release_policy;
  } *headers;
  int hcount;
  int hcapacity;                /* capacity of the header array */
};

Setting the HTTP method:

request_set_method (struct request *req, const char *meth, char *arg)
{
  req->method = meth;
  req->arg = arg;
}

Setting an HTTP header:

static void
request_set_header (struct request *req, char *name, char *value,
                    enum rp release_policy)
{
  struct request_header *hdr;
  int i;

  if (!value)
    {
      /* A NULL value is a no-op; if freeing the name is requested,
         free it now to avoid leaks.  */
      if (release_policy == rel_name || release_policy == rel_both)
        xfree (name);
      return;
    }

  /* Scan the existing headers; if the name is already present,
     release the old header and install the new one in its place. */
  for (i = 0; i < req->hcount; i++)
    {
      hdr = &req->headers[i];
      if (0 == strcasecmp (name, hdr->name))
        {
          /* Replace existing header. */
          release_header (hdr);
          hdr->name = name;
          hdr->value = value;
          hdr->release_policy = release_policy;
          return;
        }
    }

  /* If the user sets many headers and the initial 8 slots run out,
     grow the array by powers of two. */
  if (req->hcount >= req->hcapacity)
    {
      req->hcapacity <<= 1;
      req->headers = xrealloc (req->headers, req->hcapacity * sizeof (*hdr));
    }
  hdr = &req->headers[req->hcount++];
  hdr->name = name;
  hdr->value = value;
  hdr->release_policy = release_policy;
}

All subsequent header setup goes through request_set_header.
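Once assembled, the request for our example command ends up on the wire looking roughly like this (the exact header set depends on the wget version and options):

GET /index.html HTTP/1.1
User-Agent: Wget/1.x
Accept: */*
Host: www.baidu.com
Connection: Keep-Alive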

Connecting to the server:

sock = connect_to_host (conn->host, conn->port);

If host is already an IP address, wget connects to it directly; otherwise it first consults the DNS cache.

The DNS cache is a hash table mapping a host name to its IP addresses (needed when the user gives a host name rather than an IP). The host string is hashed to an integer key, the key is taken modulo the table size to get an index, and collisions are resolved by open addressing.

The key-derivation function (input: the host string; output: the hash key):

static unsigned long
hash_string_nocase (const void *key)
{
  const char *p = key;
  unsigned int h = c_tolower (*p);
  if (h)
    for (p += 1; *p != '\0'; p++)
      h = (h << 5) - h + c_tolower (*p);
  return h;
}

Note that each step above computes h = 31*h + c_tolower(*p), the classic times-31 string hash. Lookup resolves collisions with open addressing, which the original skips over.
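The lookup boils down to this (a toy sketch with linear probing over a fixed-size table; wget's hash.c is more general, growing the table and taking hash/compare function pointers):

#include <stddef.h>
#include <string.h>
#include <strings.h>

#define TABLE_SIZE 257                  /* fixed prime-sized table */

struct cell { const char *key; void *value; };
static struct cell cells[TABLE_SIZE];

/* Linear probing: start at hash % TABLE_SIZE and walk forward until
   the key or an empty cell is hit. */
static void *
lookup_sketch (unsigned long hash, const char *key)
{
  size_t i = hash % TABLE_SIZE;
  while (cells[i].key)
    {
      if (strcasecmp (cells[i].key, key) == 0)
        return cells[i].value;          /* found */
      i = (i + 1) % TABLE_SIZE;         /* probe the next cell */
    }
  return NULL;                          /* hit an empty cell: not stored */
}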

The hash table's values are struct address_list pointers:

struct address_list {
  int count;                    /* number of addresses */
  ip_address *addresses;        /* pointer to the string of addresses */
  int faulty;                   /* number of addresses known not to work. */
  bool connected;               /* whether we were able to connect to
                                   one of the addresses in the list,
                                   at least once. */
  int refcount;                 /* reference count; when it drops to
                                   0, the entry is freed. */
};

typedef struct {
  /* Address family, one of AF_INET or AF_INET6. */
  int family;

  /* The actual data, in the form of struct in_addr or in6_addr: */
  union {
    struct in_addr d4;          /* IPv4 address */
#ifdef ENABLE_IPV6
    struct in6_addr d6;         /* IPv6 address */
#endif
  } data;

  /* Under IPv6, getaddrinfo also returns a scope_id.  Since it's
     IPv6-specific it strictly belongs in the above union, but we put
     it here for simplicity.  */
#if defined ENABLE_IPV6 && defined HAVE_SOCKADDR_IN6_SCOPE_ID
  int ipv6_scope;
#endif
} ip_address;

The cache exposes these operations:

cache_query (host)         /* search */
cache_remove (host)        /* delete */
cache_store (host, val)    /* insert */

If the host is not found in the DNS hash table, wget calls the gethostbyname API (getaddrinfo on IPv6-enabled builds) to resolve the host's IP addresses, then tries connect() on each (ip, port) pair until one succeeds. Once connected, the request is assembled and sent out (request_send).
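A condensed sketch of that resolve-and-connect loop, using getaddrinfo for brevity (wget layers the DNS cache, timeouts, and --bind-address handling on top of this):

#include <netdb.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

/* Resolve host:port and try each returned address until connect()
   succeeds; returns a connected fd, or -1 on failure. */
static int
connect_to_host_sketch (const char *host, const char *port)
{
  struct addrinfo hints, *res, *ai;
  int fd = -1;

  memset (&hints, 0, sizeof hints);
  hints.ai_family = AF_UNSPEC;          /* IPv4 or IPv6 */
  hints.ai_socktype = SOCK_STREAM;      /* TCP */
  if (getaddrinfo (host, port, &hints, &res) != 0)
    return -1;

  for (ai = res; ai; ai = ai->ai_next)
    {
      fd = socket (ai->ai_family, ai->ai_socktype, ai->ai_protocol);
      if (fd < 0)
        continue;
      if (connect (fd, ai->ai_addr, ai->ai_addrlen) == 0)
        break;                          /* success: keep this fd */
      close (fd);
      fd = -1;                          /* try the next address */
    }
  freeaddrinfo (res);
  return fd;
}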

 

Reading the response header:

head = read_http_response_head (sock);

This uses select() for the read timeout and recv() with MSG_PEEK to look at the kernel's socket read buffer without consuming it, until the \r\n\r\n header terminator is found (fd_peek); the header bytes are then actually read off the socket (fd_read).
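The peek-then-consume pattern looks like this (a minimal sketch of the fd_peek/fd_read idea, not wget's actual code):

#include <string.h>
#include <sys/select.h>
#include <sys/socket.h>

/* Peek at the socket until "\r\n\r\n" appears, then consume exactly
   the header; returns the header length (terminator included) or -1
   on error/timeout. */
static ssize_t
peek_response_head (int sock, char *buf, size_t bufsize, int timeout_sec)
{
  for (;;)
    {
      fd_set rfds;
      struct timeval tv = { timeout_sec, 0 };
      FD_ZERO (&rfds);
      FD_SET (sock, &rfds);
      if (select (sock + 1, &rfds, NULL, NULL, &tv) <= 0)
        return -1;                      /* timeout or error */

      /* MSG_PEEK copies data without removing it from the kernel buffer. */
      ssize_t n = recv (sock, buf, bufsize - 1, MSG_PEEK);
      if (n <= 0)
        return -1;
      buf[n] = '\0';

      char *end = strstr (buf, "\r\n\r\n");
      if (end)                          /* full header is available */
        return recv (sock, buf, (end - buf) + 4, 0);  /* consume only the header */
    }
}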

Creating the response object:

resp = resp_new (head);

This parses the raw header received above.

Reading the body:

hs->res = fd_read_body (sock, fp, contlen != -1 ? contlen : 0,
                        hs->restval, &hs->rd_size, &hs->len, &hs->dltime,
                        flags);
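The core of fd_read_body reduces to this loop (a simplified sketch; the real function also drives progress display, rate limiting, and timers):

#include <stdio.h>
#include <unistd.h>

/* Read a body of known length (the contlen != -1 case), or until EOF
   when the length is unknown, writing through to the local file. */
static ssize_t
read_body_sketch (int sock, FILE *fp, ssize_t contlen)
{
  char buf[8192];
  ssize_t total = 0;
  while (contlen < 0 || total < contlen)
    {
      size_t want = sizeof buf;
      if (contlen >= 0 && (size_t) (contlen - total) < want)
        want = contlen - total;         /* don't read past the body */
      ssize_t n = read (sock, buf, want);
      if (n <= 0)
        break;                          /* EOF or error */
      fwrite (buf, 1, n, fp);           /* write through to the local file */
      total += n;
    }
  return total;
}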


Leek in Beijing


