Go-Pholcus抓取IJGUC所有期刊
来源:互联网 发布:边伯贤直播软件 编辑:程序博客网 时间:2024/06/08 17:09
package spider_lib

// Base packages
import (
	// "log"
	"github.com/PuerkitoBio/goquery"                        // DOM parsing
	"github.com/henrylee2cn/pholcus/app/downloader/request" // required
	. "github.com/henrylee2cn/pholcus/app/spider"           // required
	// "github.com/henrylee2cn/pholcus/logs"                // log output
	// . "github.com/henrylee2cn/pholcus/app/spider/common" // optional

	// net packages
	// "net/http" // for setting http.Header
	// "net/url"

	// encoding packages
	// "encoding/xml"
	// "encoding/json"

	// string handling packages
	"regexp"
	"strconv"
	// "strings"

	// other packages
	// "fmt"
	// "math"
	// "time"
)

// Patterns used by the article-page rule. They are compiled once at package
// scope instead of on every parsed page; the pattern text is unchanged.
var (
	ijgucTagRe         = regexp.MustCompile("\\<[\\S\\s]+?\\>")
	ijgucTitleRe       = regexp.MustCompile("Title:(.*?)Author:")
	ijgucAuthorsRe     = regexp.MustCompile("Author:(.*?)Addresses:")
	ijgucAuthorRe      = regexp.MustCompile("Author:(.*?)Address:")
	ijgucAddressesRe   = regexp.MustCompile("Addresses:(.*?)Journal:")
	ijgucAddressRe     = regexp.MustCompile("Address:(.*?)Journal:")
	ijgucJournalRe     = regexp.MustCompile("Journal:(.*?)Abstract:")
	ijgucAbstractRe    = regexp.MustCompile("Abstract:(.*?)Keywords:")
	ijgucKeywordsRe    = regexp.MustCompile("Keywords:(.*?)DOI:")
	ijgucDOIMarkerRe   = regexp.MustCompile("DOI: ")
	ijgucDOIBackoff    = 8  // characters stepped back from the end of the "DOI: " marker
	ijgucDOIWindowSize = 43 // number of characters taken for the DOI field
)

func init() {
	IJGUC.Register()
}

// ijgucField returns the first capture group of the first pattern that
// matches content. It returns "" when none of the patterns match, so a page
// that lacks a label yields an empty field instead of an index-out-of-range
// panic.
func ijgucField(content string, patterns ...*regexp.Regexp) string {
	for _, re := range patterns {
		if m := re.FindStringSubmatch(content); len(m) > 1 {
			return m[1]
		}
	}
	return ""
}

// ijgucDOI cuts the fixed-size DOI window out of content. It returns "" when
// the "DOI: " marker is absent and clamps the window to the text, so short or
// unusual pages cannot cause a slice-bounds panic.
//
// NOTE(review): the marker offset is a byte offset but it is applied to a
// rune slice, exactly as the original code did; the two differ whenever
// non-ASCII text precedes the marker. The 8/43 constants look hand-tuned
// against that mismatch, so the arithmetic is kept as-is. Confirm the
// intended window before replacing it with a capture group.
func ijgucDOI(content string) string {
	loc := ijgucDOIMarkerRe.FindStringIndex(content)
	if loc == nil {
		return ""
	}
	rs := []rune(content)
	left := loc[1] - ijgucDOIBackoff
	right := left + ijgucDOIWindowSize
	if left < 0 {
		left = 0
	}
	if right > len(rs) {
		right = len(rs)
	}
	if left >= right {
		return ""
	}
	return string(rs[left:right])
}

// IJGUC is the spider definition for the IJGUC journal on inderscience.com.
// It starts from one issue's table of contents, follows the issue links, then
// the article links, and emits one record per article page.
var IJGUC = &Spider{
	Name:        "IJGUC期刊",
	Description: "IJGUC期刊",
	// Pausetime: 300,
	// Keyin:   KEYIN,
	// Limit:        LIMIT,
	EnableCookie: false,
	RuleTree: &RuleTree{
		Root: func(ctx *Context) {
			ctx.AddQueue(&request.Request{
				Url:  "http://www.inderscience.com/info/inarticletoc.php?jcode=ijguc&year=2016&vol=7&issue=1",
				Rule: "期刊列表",
			})
		},

		Trunk: map[string]*Rule{

			// Issue list: queue every link found under #eventbody1 .. #eventbody7.
			"期刊列表": {
				ParseFunc: func(ctx *Context) {
					query := ctx.GetDom()
					for i := 1; i <= 7; i++ {
						selector := "#eventbody" + strconv.Itoa(i) + " a"
						query.Find(selector).Each(func(j int, s *goquery.Selection) {
							if url, ok := s.Attr("href"); ok {
								// log.Print(url)
								ctx.AddQueue(&request.Request{Url: url, Rule: "文章列表"})
							}
						})
					}
				},
			},

			// Article list: within the second matched td, take the second
			// table and queue every even-indexed link in it.
			"文章列表": {
				ParseFunc: func(ctx *Context) {
					query := ctx.GetDom()
					// #journalcol1 article table tbody tr td:eq(1) table:eq(1) a
					links := query.Find("#journalcol1 article table tbody tr td").Eq(1).
						Find("table").Eq(1).
						Find("a")
					links.Each(func(k int, a *goquery.Selection) {
						if k%2 != 0 {
							return
						}
						if url, ok := a.Attr("href"); ok {
							// log.Print(url)
							ctx.AddQueue(&request.Request{Url: url, Rule: "文章页"})
						}
					})
				},
			},

			// Article page: split the text of #col1 on its field labels.
			"文章页": {
				// Note: whether field semantics are declared and whether data
				// is output must stay consistent.
				ItemFields: []string{
					"Title",
					"Author",
					"Addresses",
					"Journal",
					"Abstract",
					"Keywords",
					"DOI",
				},
				ParseFunc: func(ctx *Context) {
					query := ctx.GetDom()

					// Fetch the text content and strip anything that still
					// looks like a tag.
					content := query.Find("#col1").Text()
					content = ijgucTagRe.ReplaceAllString(content, "")

					title := ijgucField(content, ijgucTitleRe)
					// Pages use either the plural "Addresses:" or the singular
					// "Address:" label; the plural form is tried first.
					author := ijgucField(content, ijgucAuthorsRe, ijgucAuthorRe)
					addresses := ijgucField(content, ijgucAddressesRe, ijgucAddressRe)
					journal := ijgucField(content, ijgucJournalRe)
					abstract := ijgucField(content, ijgucAbstractRe)
					keywords := ijgucField(content, ijgucKeywordsRe)
					doi := ijgucDOI(content)

					// Hand the record over, keyed by ItemFields position.
					ctx.Output(map[int]interface{}{
						0: title,
						1: author,
						2: addresses,
						3: journal,
						4: abstract,
						5: keywords,
						6: doi,
					})
				},
			},
		},
	},
}
0 0
- Go-Pholcus抓取IJGUC所有期刊
- Go-Pholcus爬人民网新闻规则
- Pholcus初探
- go抓取页面
- go语言抓取twitter
- 使用python抓取落网期刊图片
- .net 抓取网页所有链接
- 成功抓取douban 所有电影
- 期刊
- 期刊
- 期刊
- 计算机领域的所有SCI一区期刊,这是最顶级期刊了
- CSDN社区电子杂志项目(CSDN eMag)所有期刊总索引
- 计算机领域的所有SCI最顶级期刊
- 京东数据抓取-抓取所有图书名称
- 用JavaScript抓取頁面上所有控件
- C# 抓取页面中的所有链接
- 正则表达式 抓取网页面上所有图片
- SYSCALL_DEFINE3 宏定义(sys_poll)
- Java流机制详解
- 文件系统
- C语言深度解剖读书笔记(1.关键字的秘密)
- selenium---git、testng、maven、jenkins构建job
- Go-Pholcus抓取IJGUC所有期刊
- JAX-WS
- 开始学习C#
- K-means(K均值)
- 一目了然解释getName()、getCanonicalName()和getSimpleName()的异同
- 两种方式实现checkBox readonly功能
- [Spring MVC] - SpringMVC的各种参数绑定方式
- 浅谈Autolayout-02代码实现Autolayout
- 两个链表的第一个公共结点