用 Go 写的爬虫小程序

来源:互联网 发布:java链表反序 编辑:程序博客网 时间:2024/05/22 11:30
package main


import (
"fmt"
"io/ioutil"
"math/rand"
"net/http"
"regexp"
"runtime"
"time"
)


// userAgent is the pool of User-Agent header values that GetRandomUserAgent
// picks from, one per outgoing request.
var userAgent = []string{
	"Mozilla/5.0 (compatible, MSIE 10.0, Windows NT, DigExt)",
	"Mozilla/4.0 (compatible, MSIE 7.0, Windows NT 5.1, 360SE)",
	"Mozilla/4.0 (compatible, MSIE 8.0, Windows NT 6.0, Trident/4.0)",
	// The original entry was truncated ("Trident/5.0," with no closing
	// parenthesis); it is terminated properly here.
	"Mozilla/5.0 (compatible, MSIE 9.0, Windows NT 6.1, Trident/5.0)",
	"Opera/9.80 (Windows NT 6.1, U, en) Presto/2.8.131 Version/11.11",
	"Mozilla/4.0 (compatible, MSIE 7.0, Windows NT 5.1, TencentTraveler 4.0)",
	"Mozilla/5.0 (Windows, U, Windows NT 6.1, en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
	"Mozilla/5.0 (Macintosh, Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
	"Mozilla/5.0 (Macintosh, U, Intel Mac OS X 10_6_8, en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
	"Mozilla/5.0 (Linux, U, Android 3.0, en-us, Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
	"Mozilla/5.0 (iPad, U, CPU OS 4_3_3 like Mac OS X, en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
	"Mozilla/4.0 (compatible, MSIE 7.0, Windows NT 5.1, Trident/4.0, SE 2.X MetaSr 1.0, SE 2.X MetaSr 1.0, .NET CLR 2.0.50727, SE 2.X MetaSr 1.0)",
	"Mozilla/5.0 (iPhone, U, CPU iPhone OS 4_3_3 like Mac OS X, en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
	"MQQBrowser/26 Mozilla/5.0 (Linux, U, Android 2.3.7, zh-cn, MB200 Build/GRJ22, CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
}


// r is the package-level pseudo-random generator, seeded once from the
// wall clock when the package is initialised.
// NOTE(review): a *rand.Rand created with rand.New is not safe for concurrent
// use; guard it if it is ever called from goroutines that run in parallel.
var r *rand.Rand = rand.New(
	rand.NewSource(time.Now().UnixNano()),
)


// Regular expressions used to pull links out of a fetched page.
//
// atagRegExp matches a complete <a ...>text</a> element whose opening tag
// carries an href attribute (any letter case) with a double- or single-quoted
// value. The original pattern wrapped the alternatives in [...] which made
// them character classes, so it also matched anchors without any href.
//
// hrefRegExp matches one href="..." or href='...' attribute. Capture group 1
// holds a double-quoted value and group 2 a single-quoted one. The value is
// matched with a negated class rather than a greedy ".+", so the match stops
// at the closing quote instead of swallowing the attributes that follow.
var (
	atagRegExp = regexp.MustCompile(`(?i)<a\s+(?:[^>]*?\s)?href\s*=\s*(?:"[^"]*"|'[^']*')[^>]*>[^<]*</a>`)
	hrefRegExp = regexp.MustCompile(`(?i)href\s*=\s*(?:"([^"]*)"|'([^']*)')`)
)


// main runs 3000 fetches through GetContent, adds up the values reported on
// the result channel, prints the sum and then waits five seconds before
// exiting.
//
// NOTE(review): each goroutine is awaited immediately after it is started, so
// the fetches effectively run one at a time, and the loop blocks forever if a
// GetContent call returns without sending on the channel.
func main() {
	const requests = 3000

	runtime.GOMAXPROCS(4)

	results := make(chan int, 1000)
	total := 0
	for n := 0; n < requests; n++ {
		go GetContent(results)
		total += <-results
	}

	fmt.Println(total)
	time.Sleep(5 * time.Second)
}


// GetContent fetches the hard-coded trade page once with a randomly chosen
// User-Agent header. On a 200 response it prints the body length followed by
// the href attribute of every anchor tag found in the body.
//
// Exactly one value is sent on c per call, on every exit path: the body
// length in bytes for a successfully read 200 response, otherwise 0. A caller
// that receives once per call therefore never blocks forever on a failed
// request.
//
// It returns 1 when a response was received and processed (including non-200
// statuses, as before) and 0 when the request could not be built, sent, or
// its body could not be read.
func GetContent(c chan int) int {
	lenth := 0
	// Deferred so the caller gets its result even on the early error returns;
	// the original skipped the send there and left the receiver hanging.
	defer func() { c <- lenth }()

	url := "http://www.trade.com/trade.php"
	req, e := http.NewRequest("GET", url, nil)
	if e != nil {
		fmt.Printf("Get请求%s返回错误:%s\n", url, e)
		return 0
	}
	req.Header.Set("User-Agent", GetRandomUserAgent())

	// http.DefaultClient has no timeout, so a stalled server would block this
	// call indefinitely. A Client with a nil Transport still uses
	// http.DefaultTransport, so connections remain pooled across calls.
	client := &http.Client{Timeout: 10 * time.Second}
	res, e := client.Do(req)
	if e != nil {
		// The original built an error with fmt.Errorf and discarded it, so
		// the failure was never reported; print it instead.
		fmt.Printf("Get请求%s返回错误:%s\n", url, e)
		return 0
	}
	// Close the body for every status code, not only for 200.
	defer res.Body.Close()

	if res.StatusCode != http.StatusOK {
		return 1
	}

	bodyBety, e := ioutil.ReadAll(res.Body)
	if e != nil {
		fmt.Printf("Get请求%s返回错误:%s\n", url, e)
		return 0
	}
	resStr := string(bodyBety)

	// The original never assigned lenth, so 0 was always reported; the body
	// length is what it printed alongside and is what the name suggests.
	lenth = len(resStr)
	fmt.Println(lenth)

	// FindAllString yields the matched anchor text itself, which is what
	// hrefRegExp.FindString needs (FindAllStringIndex returned index pairs
	// and did not type-check here).
	for _, a := range atagRegExp.FindAllString(resStr, -1) {
		fmt.Println(hrefRegExp.FindString(a))
	}
	return 1
}


func GetRandomUserAgent() string {
return userAgent[r.Intn(len(userAgent))] //范围内随机数
}
原创粉丝点击