[GoLang Notes] A Tour of Go - Exercise: Web Crawler


This post works through one of the exercises in A Tour of Go; the problem statement and a reference implementation follow.

Note: since the GFW blocks GAE in mainland China, the official Go site and the tour may only be reachable through a proxy.

Exercise: Web Crawler

In this exercise you'll use Go's concurrency features to parallelize a web crawler.
Modify the Crawl function to fetch URLs in parallel without fetching the same URL twice.

The exercise asks for a concurrent crawler that never fetches the same page twice.

Here is the original skeleton code to be modified:

package main

import (
    "fmt"
)

type Fetcher interface {
    // Fetch returns the body of URL and
    // a slice of URLs found on that page.
    Fetch(url string) (body string, urls []string, err error)
}

// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher) {
    // TODO: Fetch URLs in parallel.
    // TODO: Don't fetch the same URL twice.
    // This implementation doesn't do either:
    if depth <= 0 {
        return
    }
    body, urls, err := fetcher.Fetch(url)
    if err != nil {
        fmt.Println(err)
        return
    }
    fmt.Printf("found: %s %q\n", url, body)
    for _, u := range urls {
        Crawl(u, depth-1, fetcher)
    }
    return
}

func main() {
    Crawl("http://golang.org/", 4, fetcher)
}

// fakeFetcher is Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult

type fakeResult struct {
    body string
    urls []string
}

func (f fakeFetcher) Fetch(url string) (string, []string, error) {
    if res, ok := f[url]; ok {
        return res.body, res.urls, nil
    }
    return "", nil, fmt.Errorf("not found: %s", url)
}

// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
    "http://golang.org/": &fakeResult{
        "The Go Programming Language",
        []string{
            "http://golang.org/pkg/",
            "http://golang.org/cmd/",
        },
    },
    "http://golang.org/pkg/": &fakeResult{
        "Packages",
        []string{
            "http://golang.org/",
            "http://golang.org/cmd/",
            "http://golang.org/pkg/fmt/",
            "http://golang.org/pkg/os/",
        },
    },
    "http://golang.org/pkg/fmt/": &fakeResult{
        "Package fmt",
        []string{
            "http://golang.org/",
            "http://golang.org/pkg/",
        },
    },
    "http://golang.org/pkg/os/": &fakeResult{
        "Package os",
        []string{
            "http://golang.org/",
            "http://golang.org/pkg/",
        },
    },
}

One reference implementation of Crawl() that meets the requirements is shown below:

package main

import (
    "fmt"
)

type Fetcher interface {
    // Fetch returns the body of URL and
    // a slice of URLs found on that page.
    Fetch(url string) (body string, urls []string, err error)
}

// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher) {
    if depth <= 0 {
        return
    }

    type fetched_res struct {
        url   string
        body  string
        urls  []string
        depth int
        err   error
    }

    // fetched_set is used for URL deduplication.
    fetched_set := make(map[string]bool)

    // fetched_ch is used to sync among goroutines.
    fetched_ch := make(chan *fetched_res)

    // fetch_routine is a function value that does the real crawl job
    // and sends the result to the channel.
    fetch_routine := func(url string, depth int) {
        body, urls, err := fetcher.Fetch(url)
        fetched_ch <- &fetched_res{url, body, urls, depth, err}
    }

    // Start with the seed URL.
    go fetch_routine(url, depth)

    for progress := 1; progress > 0; progress-- {
        res_ptr := <-fetched_ch

        // When a fetch fails, the overall progress count must still drop by 1,
        // so the error has to be handled here (rather than inside fetch_routine)
        // to keep the progress counter, and thus the loop's termination, correct.
        if res_ptr.err != nil {
            fmt.Println(res_ptr.err)
            continue
        }

        fmt.Printf("found: %s %q\n", res_ptr.url, res_ptr.body)

        // Add to the fetched URL set.
        fetched_set[res_ptr.url] = true

        // Crawl recursively if the maximum depth has not been reached.
        cur_depth := res_ptr.depth - 1
        if cur_depth > 0 {
            for _, candidate := range res_ptr.urls {
                if !fetched_set[candidate] {
                    progress++
                    go fetch_routine(candidate, cur_depth)
                } else {
                    fmt.Printf("fetched already: %s\n", candidate)
                    continue
                }
            }
        }
    }
    return
}

func main() {
    Crawl("http://golang.org/", 2, fetcher)
}

// fakeFetcher is Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult

type fakeResult struct {
    body string
    urls []string
}

func (f fakeFetcher) Fetch(url string) (string, []string, error) {
    if res, ok := f[url]; ok {
        return res.body, res.urls, nil
    }
    return "", nil, fmt.Errorf("not found: %s", url)
}

// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
    "http://golang.org/": &fakeResult{
        "The Go Programming Language",
        []string{
            "http://golang.org/pkg/",
            "http://golang.org/cmd/",
        },
    },
    "http://golang.org/pkg/": &fakeResult{
        "Packages",
        []string{
            "http://golang.org/",
            "http://golang.org/cmd/",
            "http://golang.org/pkg/fmt/",
            "http://golang.org/pkg/os/",
        },
    },
    "http://golang.org/pkg/fmt/": &fakeResult{
        "Package fmt",
        []string{
            "http://golang.org/",
            "http://golang.org/pkg/",
        },
    },
    "http://golang.org/pkg/os/": &fakeResult{
        "Package os",
        []string{
            "http://golang.org/",
            "http://golang.org/pkg/",
        },
    },
}
The exercise does not actually fetch pages over the network; instead, the fakeFetcher map simulates a small hierarchy of pages.
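If you did want to crawl real pages, one way would be a Fetcher backed by net/http, roughly like the sketch below. The httpFetcher type and its naive regexp-based link extraction are my own illustration and not part of the exercise; a real crawler would parse the HTML properly and resolve relative URLs.

package main

import (
    "fmt"
    "io"
    "net/http"
    "regexp"
)

// httpFetcher is a hypothetical Fetcher that retrieves real pages with net/http.
// Link extraction uses a naive regexp and only picks up absolute http(s) links.
type httpFetcher struct{}

var hrefRe = regexp.MustCompile(`href="(https?://[^"]+)"`)

func (httpFetcher) Fetch(url string) (body string, urls []string, err error) {
    resp, err := http.Get(url)
    if err != nil {
        return "", nil, err
    }
    defer resp.Body.Close()

    data, err := io.ReadAll(resp.Body)
    if err != nil {
        return "", nil, err
    }
    body = string(data)

    // Collect the absolute links found in the page.
    for _, m := range hrefRe.FindAllStringSubmatch(body, -1) {
        urls = append(urls, m[1])
    }
    return body, urls, nil
}

func main() {
    b, us, err := httpFetcher{}.Fetch("https://golang.org/")
    if err != nil {
        fmt.Println(err)
        return
    }
    fmt.Println(len(b), "bytes,", len(us), "links")
}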

It is also worth noting that when a URL is not a key in the fakeFetcher map, Fetch returns an error, which simulates a 404 response. The concurrent logic in Crawl() must handle this error correctly so that the fetch loop's termination condition stays accurate; otherwise, running the code in the official Go Playground ends in a channel deadlock.
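For comparison, a common alternative to the progress-counter approach is to protect the visited-URL set with a mutex and let a sync.WaitGroup decide when the crawl is finished; Wait() returns only after every spawned goroutine has called Done(), which also sidesteps the deadlock problem. The sketch below is my own variant (safeSet and CrawlWG are not from the original exercise) and assumes the Fetcher interface and the fetcher variable from the listing above.

// safeSet is a hypothetical mutex-protected set of visited URLs.
type safeSet struct {
    mu   sync.Mutex
    seen map[string]bool
}

// visit marks url as seen and reports whether it was new.
func (s *safeSet) visit(url string) bool {
    s.mu.Lock()
    defer s.mu.Unlock()
    if s.seen[url] {
        return false
    }
    s.seen[url] = true
    return true
}

// CrawlWG crawls concurrently, using set to avoid fetching a URL twice
// and wg to wait until every spawned goroutine has finished.
func CrawlWG(url string, depth int, fetcher Fetcher, set *safeSet, wg *sync.WaitGroup) {
    defer wg.Done()
    if depth <= 0 || !set.visit(url) {
        return
    }
    body, urls, err := fetcher.Fetch(url)
    if err != nil {
        fmt.Println(err)
        return
    }
    fmt.Printf("found: %s %q\n", url, body)
    for _, u := range urls {
        wg.Add(1)
        go CrawlWG(u, depth-1, fetcher, set, wg)
    }
}

func main() {
    set := &safeSet{seen: make(map[string]bool)}
    var wg sync.WaitGroup
    wg.Add(1)
    go CrawlWG("http://golang.org/", 4, fetcher, set, &wg)
    wg.Wait()
}

(This version also needs "sync" added to the import list of the program above.)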

====================== EOF =======================

