嵌入式 wget 非递归下载

来源：互联网发布：伴读软件编辑：程序博客网时间：2024/05/18 12:35

1整体概括

上篇分析的是wget参数解析，本篇主要分析wget非递归下载html或者文件。wget实际上就是通过socket向web服务器发送http数据包(GET or POST)，web服务器收到请求后，发回回复包给wget。当然了，http的传输层是tcp协议，简单来说：wget发tcp包，发送内容符合http协议，web服务器(such as nginx、apache)解析请求包，针对请求回复回复包。so easy.

整个wget可以用下面的流程图概括

其中config_analysis在前一篇已经分析过了，本篇就是分析wget后面的实现。

我们仅仅是不加任何参数的非递归下载，也就是执行如下命令：

wget www.baidu.com/index.html

2 代码详细解析

之前分析的是参数解析篇，参数解析完之后，会对参数进行校验。此段代码略过

nurl = argc - optind //参数解析完后，获取用户下载url个数，因为执行的命令是

wget www.baidu.com/index.html所以 nurl==1

[cpp] view plaincopy

//code1:
/* Build an array with one URL string per remaining command-line argument,
   plus one extra slot for the terminating NULL. */
url = alloca_array (char *, nurl + 1);
for (i = 0; i < nurl; i++, optind++)
{
char *rewritten = rewrite_shorthand_url (argv[optind]);// per the article: prepends http:// when the user omitted the scheme
if (rewritten)
url[i] = rewritten;
else
url[i] = xstrdup (argv[optind]);
}

url[i] = NULL;// set the last element to NULL so it can serve as the end marker

上面那块代码主要是为url分配内存，分配nurl+1个元素的char*数组。

函数rewrite_shorthand_url (argv[optind]) 主要是为url添加http://字段，支持用户不输入协议，wget支持http、https、ftp协议，如果用户没输入协议，默认http://。并且url最后一个元素置为NULL，作为标志位。

[cpp] view plaincopy

//code2部分代码：
for (t = url; *t; t++)
{
char *filename = NULL, *redirected_URL = NULL;
int dt, url_err;
/* Need to do a new struct iri every time, because
* retrieve_url may modify it in some circumstances,
* currently. */
struct iri *iri = iri_new ();
struct url *url_parsed;
set_uri_encoding (iri, opt.locale, true);
/*对url进行解析*/
url_parsed = url_parse (*t, &url_err, iri, true);
if (!url_parsed)
{
char *error = url_error (*t, url_err);
logprintf (LOG_NOTQUIET, "%s: %s.\n",*t, error);
xfree (error);
inform_exit_status (URLERROR);
}
else
{
/*如果是递归or需要页面css js之类的，并且不是ftp协议*/
if ((opt.recursive || opt.page_requisites)
&& (url_scheme (*t) != SCHEME_FTP || url_uses_proxy (url_parsed)))
/*此case为递归url下载*/
{
int old_follow_ftp = opt.follow_ftp;
/* Turn opt.follow_ftp on in case of recursive FTP retrieval */
if (url_scheme (*t) == SCHEME_FTP)
opt.follow_ftp = 1;
retrieve_tree (url_parsed, NULL);
opt.follow_ftp = old_follow_ftp;
}
else
{
/*此处为非递归url下载*/
retrieve_url (url_parsed, *t, &filename, &redirected_URL, NULL,
&dt, opt.recursive, iri, true);
}

[cpp] view plaincopy

[cpp] view plaincopy

代码遍历url，对用户的每一个url都进行下载。

函数url_parse主要是对url进行解析（set_uri_encoding只是设置iri的编码方式）。一个正常的url是如下格式：

scheme://host[:port][/path][;params][?query][#fragment],此函数就是对url解析出来每一个结构。如下：

[cpp] view plaincopy

url : http://www.baidu.com/index.html
scheme: SCHEME_HTTP
host: www.baidu.com
port: 80
path: index.html
params: NULL
query: NULL
fragment: NULL
file: index.html
user: NULL
passwd: NULL

同时会对url进行utf-8编码。

会根据用户参数来决定是递归下载or非递归下载

递归下载条件：(用户输入-r or -p) && (not ftp协议 or use_proxy)

因为我们是直接下载，所以会跳到

retrieve_url (url_parsed, *t,&filename, &redirected_URL, NULL,

&dt, opt.recursive, iri, true);

2.1 retrieve_url

//如果使用proxy会设置一些属性，因为没有用proxy所以跳过了。

2.2 httploop

result = http_loop (u, orig_parsed,&mynewloc, &local_file, refurl, dt, proxy_url, iri);

参数说明：

u和orig_parsed的属性值相同

mynewloc 指向NULL。

local_file 指向NULL。

refurl指向NULL。

dt 为 -1。

proxy_url 指向NULL。

iri为上层分析的那个iri，包括编码方式。

[cpp] view plaincopy

//code
hstat.referer = referer;// record the referer; per the article it is NULL in this walkthrough
// Choose the local file name: --output-document takes priority; otherwise,
// when content-disposition handling is off, derive the name from the URL.
if (opt.output_document)
{
hstat.local_file = xstrdup (opt.output_document);
got_name = true;
}
else if (!opt.content_disposition)
{
hstat.local_file = url_file_name (opt.trustservernames ? u : original_url, NULL);
/* Per the article: if a file named u->file already exists, url_file_name
generates a new name such as file_1...; if clobber is set the existing
file is overwritten instead. */
got_name = true;
}

2.3 gethttp

[cpp] view plaincopy

req = request_new ();//构造一个req头
/* Create a new, empty request object.

   The request itself comes from xnew0 and its header array from
   xnew_array; the array starts with room for INITIAL_HEADER_CAPACITY
   entries and that capacity is recorded in hcapacity.  Returns the
   newly allocated request.  */
static struct request *
request_new (void)
{
  enum { INITIAL_HEADER_CAPACITY = 8 };
  struct request *fresh = xnew0 (struct request);

  fresh->hcapacity = INITIAL_HEADER_CAPACITY;
  fresh->headers = xnew_array (struct request_header, fresh->hcapacity);

  return fresh;
}

下面是请求结构

[cpp] view plaincopy

/* An HTTP request being assembled: the method, its argument, and a
   resizable array of headers. */
struct request {
const char *method;/* request method */
char *arg; /* request content (the argument that accompanies the method) */
/* Each entry stores one HTTP header as a name/value pair, e.g. for
"content-length: xxxx" the name is "content-length" and the value is
"xxxx".  release_policy records how name/value should be released
(NOTE(review): exact enum rp semantics are not shown here).
*/
struct request_header {
char *name, *value;
enum rp release_policy;
} *headers;
int hcount; /* number of header entries currently in use */
int hcapacity; /* number of entries the headers array can hold */
};

设置http方法

[cpp] view plaincopy

request_set_method（req, meth, meth_arg）
{
req->method = meth;
req->arg = arg;
}

设置http header

[cpp] view plaincopy

static void
request_set_header (struct request *req, char *name, char *value,
enum rp release_policy)
{
struct request_header *hdr;
int i;
if (!value)
{
/* A NULL value is a no-op; if freeing the name is requested,
free it now to avoid leaks. */
if (release_policy == rel_name || release_policy == rel_both)
xfree (name);
return;
}
//首先是遍历所有头部，如果说找到的话，就释放设置成新的头
for (i = 0; i < req->hcount; i++)
{
hdr = &req->headers[i];
if (0 == strcasecmp (name, hdr->name))
{
/* Replace existing header. */
release_header (hdr);
hdr->name = name;
hdr->value = value;
hdr->release_policy = release_policy;
return;
}
}
//如果用户设置的头很多，超过了8个就重新分配 2的幂增长
if (req->hcount >= req->hcapacity)
{
req->hcapacity <<= 1;
req->headers = xrealloc (req->headers, req->hcapacity * sizeof (*hdr));
}
hdr = &req->headers[req->hcount++];
hdr->name = name;
hdr->value = value;
hdr->release_policy = release_policy;
}

后面设置头都调用request_set_header这个函数

连接服务器：

Sock = connect_to_host (conn->host,conn->port)

如果是host为ip地址，那么就直接连接，如果不是，首先查找dns cache

DNS cache hash table（如果给出的是host，就得获得host的ip）

此hash是通过算法把host字符串算成一个整数key，然后再求余算索引，hash处理冲突采用开放定址法。

创建key算法(key为host，算出来的结果为hash的key)

[cpp] view plaincopy

/* Case-insensitive string hash.

   KEY points to a NUL-terminated string.  Every character is passed
   through c_tolower before being mixed in, so strings that differ only
   in letter case hash to the same value.  The running hash is
   multiplied by 31 (computed as (h << 5) - h) before each new character
   is added.  An empty string hashes to 0.  */
static unsigned long
hash_string_nocase (const void *key)
{
  const char *cursor = key;
  unsigned int hash = c_tolower (*cursor);

  /* Empty string: the first character is the terminator.  */
  if (hash == 0)
    return hash;

  while (*++cursor != '\0')
    hash = (hash << 5) - hash + c_tolower (*cursor);

  return hash;
}

查找hash key 算法是开放定址法，这里就不说了。

Hash表的value为struct address_list *al

[cpp] view plaincopy

/* A reference-counted list of IP addresses, with bookkeeping for how
   many of them are known not to work and whether any connection has
   succeeded. */
struct address_list {
int count; /* number of addresses */
ip_address *addresses; /* pointer to the array of addresses */
int faulty; /* number of addresses known not to work. */
bool connected; /* whether we were able to connect to
one of the addresses in the list,
at least once. */
int refcount; /* reference count; when it drops to
0, the entry is freed. */
};
/* A single IP address: the address family plus the raw IPv4 or (when
   ENABLE_IPV6 is defined) IPv6 address data. */
typedef struct {
/* Address family, one of AF_INET or AF_INET6. */
int family;
/* The actual data, in the form of struct in_addr or in6_addr: */
union {
struct in_addr d4; /* IPv4 address */
#ifdef ENABLE_IPV6
struct in6_addr d6; /* IPv6 address */
#endif
} data;
/* Under IPv6 getaddrinfo also returns scope_id. Since it's
IPv6-specific it strictly belongs in the above union, but we put
it here for simplicity. */
#if defined ENABLE_IPV6 && defined HAVE_SOCKADDR_IN6_SCOPE_ID
int ipv6_scope;
#endif
} ip_address;

以下是hash表提供的接口：

[cpp] view plaincopy

cache_query(host) //search
cache_remove(host) //delete
cache_store(host,val) //insert

如果在dns hash table中找不到，就调用gethostbyname api来获取host的ip，对每一个ip port进行connect，直到连接成功为止。连接成功host后，把req组包发送出去(request_send)

读取回复头：

Head = read_http_response_head(sock)

此时使用了select作为事件超时机制，并用MSG_PEEK预先读取内核socket read buffer中的数据，但是数据不删除，直到找到\r\n\r\n(fd_peek)，然后进行实际读取(fd_read)

New 回复数据包:

resp = resp_new (head);

解析数据包

读取body部分:

hs->res = fd_read_body (sock, fp,contlen != -1 ? contlen : 0,

hs->restval, &hs->rd_size, &hs->len,&hs->dltime,flags);

0 0