Maven_Webmagic 实例
来源:互联网 发布:北京教育软件开发公司 编辑:程序博客网 时间:2024/05/22 17:41
webmagic中文文档
项目目录
Device.java
package com.demo.webmagic.bean;import java.util.Date;public class Device { private Integer id; // private String code; // private String name; // private String model; // private String manufacturer; // private String country; // private String contact; // private String contactNumber; // private String email; // private String institute; // private String location; // private Object specification; // private Object performance; // private Object application; // private Object description; // private String purchaseDate; // private String price; // private Object feeStandard; // private String imageUrl; // private String imageLocal; // private String province; // private String dataSource; // private String recorder; // private Date recordDateTime; // private String nameEn; // private String postCode; // public Integer getId() { return this.id; } public void setId(Integer id) { this.id = id; } public String getCode() { return this.code; } public void setCode(String code) { this.code = code; } public String getName() { return this.name; } public void setName(String name) { this.name = name; } public String getModel() { return this.model; } public void setModel(String model) { this.model = model; } public String getManufacturer() { return this.manufacturer; } public void setManufacturer(String manufacturer) { this.manufacturer = manufacturer; } public String getCountry() { return this.country; } public void setCountry(String country) { this.country = country; } public String getContact() { return this.contact; } public void setContact(String contact) { this.contact = contact; } public String getContactNumber() { return this.contactNumber; } public void setContactNumber(String contactNumber) { this.contactNumber = contactNumber; } public String getEmail() { return this.email; } public void setEmail(String email) { this.email = email; } public String getInstitute() { return this.institute; } public void setInstitute(String institute) { this.institute = institute; } 
public String getLocation() { return this.location; } public void setLocation(String location) { this.location = location; } public Object getSpecification() { return this.specification; } public void setSpecification(Object specification) { this.specification = specification; } public Object getPerformance() { return this.performance; } public void setPerformance(Object performance) { this.performance = performance; } public Object getApplication() { return this.application; } public void setApplication(Object application) { this.application = application; } public Object getDescription() { return this.description; } public void setDescription(Object description) { this.description = description; } public String getPurchaseDate() { return this.purchaseDate; } public void setPurchaseDate(String purchaseDate) { this.purchaseDate = purchaseDate; } public String getPrice() { return this.price; } public void setPrice(String price) { this.price = price; } public Object getFeeStandard() { return this.feeStandard; } public void setFeeStandard(Object feeStandard) { this.feeStandard = feeStandard; } public String getImageUrl() { return this.imageUrl; } public void setImageUrl(String imageUrl) { this.imageUrl = imageUrl; } public String getImageLocal() { return this.imageLocal; } public void setImageLocal(String imageLocal) { this.imageLocal = imageLocal; } public String getProvince() { return this.province; } public void setProvince(String province) { this.province = province; } public String getDataSource() { return this.dataSource; } public void setDataSource(String dataSource) { this.dataSource = dataSource; } public String getRecorder() { return this.recorder; } public void setRecorder(String recorder) { this.recorder = recorder; } public Date getRecordDateTime() { return this.recordDateTime; } public void setRecordDateTime(Date recordDateTime) { this.recordDateTime = recordDateTime; } public String getNameEn() { return this.nameEn; } public void setNameEn(String 
nameEn) { this.nameEn = nameEn; } public String getPostCode() { return this.postCode; } public void setPostCode(String postCode) { this.postCode = postCode; } @Override public String toString() { return "Device [id=" + id + ", code=" + code + ", name=" + name + ", model=" + model + ", manufacturer=" + manufacturer + ", country=" + country + ", contact=" + contact + ", contactNumber=" + contactNumber + ", email=" + email + ", institute=" + institute + ", location=" + location + ", specification=" + specification + ", performance=" + performance + ", application=" + application + ", description=" + description + ", purchaseDate=" + purchaseDate + ", price=" + price + ", feeStandard=" + feeStandard + ", imageUrl=" + imageUrl + ", imageLocal=" + imageLocal + ", province=" + province + ", dataSource=" + dataSource + ", recorder=" + recorder + ", recordDateTime=" + recordDateTime + ", nameEn=" + nameEn + ", postCode=" + postCode + "]"; }}
ShanxiProcessor.java
package com.demo.webmagic.processor;import java.util.Date;import java.util.List;import com.demo.webmagic.bean.Device;import com.demo.webmagic.util.ImageDownloader;import com.demo.webmagic.util.ImageDownloader.ImgNameType;import us.codecraft.webmagic.Page;import us.codecraft.webmagic.Site;import us.codecraft.webmagic.Spider;import us.codecraft.webmagic.processor.PageProcessor;import us.codecraft.webmagic.selector.Html;public class ShanxiProcessor implements PageProcessor { private static int currentPage = 87; private static final int LAST_PAGE = 112; public static final String SAVE_PATH = "D:/image/shanxi/"; public static final String DOMAIN = "http://www.tydxyq.cn"; public static final String URL_LIST = "http://www\\.tydxyq\\.cn/yqsb/list.asp\\?page=\\.*"; public static final String URL_POST = "http://www\\.tydxyq\\.cn/yqsb/detail.asp\\?ID=\\.*"; public static final String PREFIX_LIST = "http://www.tydxyq.cn/yqsb/list.asp?page="; public static final String PREFIX_POST = "http://www.tydxyq.cn/yqsb/detail.asp?ID="; private Site site = Site.me() .setRetryTimes(3) .setTimeOut(10000) .setCharset("GBK") .setDomain("www.tydxyq.cn") .setSleepTime(3000) .setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); @Override public void process(Page page) {// System.out.println(page.getHtml()); if (page.getUrl().regex(URL_LIST).match()) { if (currentPage > LAST_PAGE) { return; } addTargetRequests(page); return; } if (!page.getUrl().regex(URL_LIST).match()) { Device device = null; try { device = createDevice(page); } catch (Exception e1) { e1.printStackTrace(); } System.out.println(device); try {// deviceService.add(device); } catch (Exception e) { e.printStackTrace(); } } } private void addTargetRequests(Page page) { List<String> urlList = page.getHtml().xpath("//tr[@align='center']").links().all(); for (String urlString : urlList) { if (urlString.contains(PREFIX_POST)) { 
page.addTargetRequest(urlString); } } page.addTargetRequest(PREFIX_LIST + currentPage++); } private Device createDevice(Page page) { Html html = page.getHtml(); Device device = new Device(); device.setCode(createCode(page)); String imageUrl = html.xpath("//table[4]//tr/td[2]/table[2]//a/@href").toString(); device.setImageUrl(imageUrl); try { String imageLocal = ImageDownloader.download(imageUrl, SAVE_PATH, ImgNameType.OBTAIN); device.setImageLocal(imageLocal); } catch (Exception e) { e.printStackTrace(); } device.setName(html.xpath("//table[4]//tr/td[2]/table[1]//tr[1]/td//strong/text()").toString().trim()); device.setModel(html.xpath("//table[4]//tr/td[2]/table[1]//tr[1]/td/text()").toString().substring(3));// device.setUnivercity(html.xpath("//table[4]//tr/td[2]/table[1]//tr[4]/td[2]/allText()").toString().trim()); device.setPurchaseDate(html.xpath("//table[4]//tr/td[2]/table[1]//tr[9]/td[2]/text()").toString().trim()); device.setPrice(html.xpath("//table[4]//tr/td[2]/table[1]//tr[10]/td[2]/text()").toString().trim()); device.setApplication(html.xpath("//table[4]//tr/td[2]/table[1]//tr[14]/td[2]/allText()").toString().trim()); device.setFeeStandard(html.xpath("//table[4]//tr/td[2]/table[3]//table[@id='tb1']//tr[3]/td/allText()").toString()); device.setInstitute(html.xpath("//table[4]//tr/td[2]/table[3]//table[@id='tb1']//tr[4]/td/text()").toString().substring(1)); device.setContact(html.xpath("//table[4]//tr/td[2]/table[3]//table[@id='tb1']//tr[5]/td/text()").toString().substring(1)); device.setEmail(html.xpath("//table[4]//tr/td[2]/table[3]//table[@id='tb1']//tr[6]/td/text()").toString().substring(1)); device.setContactNumber(html.xpath("//table[4]//tr/td[2]/table[3]//table[@id='tb1']//tr[7]/td/allText()").toString().substring(6)); device.setSpecification(html.xpath("//table[4]//tr/td[2]/table[3]//table[@id='tb2']//td/allText()").toString().substring(5)); 
device.setCountry(html.xpath("//table[4]//tr/td[2]/table[3]//table[@id='tb4']//tr[1]/td/text()").toString().substring(1)); device.setManufacturer(html.xpath("//table[4]//tr/td[2]/table[3]//table[@id='tb4']//tr[2]/td/text()").toString().substring(1)); device.setProvince("山西省"); device.setDataSource(DOMAIN); device.setRecorder("liuzhiguo"); device.setRecordDateTime(new Date()); return device; } private String createCode(Page page) { String urlString = page.getUrl().toString(); return urlString.substring(urlString.lastIndexOf("=") + 1); } @Override public Site getSite() { return site; } public static void main(String[] args) { Spider.create(new ShanxiProcessor()).addUrl(PREFIX_LIST + currentPage++).thread(10).run(); }}
ImageDownloader.java
package com.demo.webmagic.util;import java.io.File;import java.io.FileOutputStream;import java.io.IOException;import java.io.InputStream;import java.io.OutputStream;import java.net.URL;import java.net.URLConnection;import java.util.UUID;public class ImageDownloader { public enum ImgNameType { OBTAIN, UUID } public static String download(String imageUrl, String savePath, ImgNameType imgNameType) throws Exception { if (imageUrl == null) { return null; } String imageName = obtainImageName(imageUrl, imgNameType); String imgSavePath = createImgSavePath(savePath, imageName); if (new File(imgSavePath).exists()) { System.out.println("图片已存在:" + imgSavePath); return imgSavePath; } downloadImage(imageUrl, imgSavePath); return imgSavePath; } // 图片路径中出现中文 会出错 private static void downloadImage(String imageUrl, String imgSavePath) throws Exception { URLConnection con = new URL(imageUrl).openConnection(); con.setConnectTimeout(5 * 1000); InputStream is = con.getInputStream(); OutputStream os = new FileOutputStream(imgSavePath); byte[] bs = new byte[2048]; int len; while ((len = is.read(bs)) != -1) { os.write(bs, 0, len); } closeIOStream(is, os); } private static String createImgSavePath(String savePath, String imageName) { File sf = createFolder(savePath); return sf.getPath() + "\\" + imageName; } private static String obtainImageName(String urlString, ImgNameType imgNameType) { if (imgNameType == ImgNameType.UUID) { return UUID.randomUUID().toString() + ".jpg"; } if (urlString.contains("?")) { return urlString.substring(urlString.lastIndexOf("=") + 1) + ".jpg"; } // if (!urlString.contains("\\.")) { // return urlString.substring(urlString.lastIndexOf("/") + 1) + ".jpg"; // } return urlString.substring(urlString.lastIndexOf("/") + 1); } private static void closeIOStream(InputStream is, OutputStream os) { if (is != null) { try { is.close(); } catch (IOException e) { e.printStackTrace(); } } if (os != null) { try { os.close(); } catch (IOException e) { e.printStackTrace(); } } 
System.out.println("下载完成"); } private static File createFolder(String savePath) { File sf = new File(savePath); if (!sf.exists()) { sf.mkdirs(); } return sf; }}
pom.xml
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.demo</groupId>
  <artifactId>maven-webmagic</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <packaging>jar</packaging>

  <name>maven-webmagic</name>
  <url>http://maven.apache.org</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>

  <dependencies>
    <!-- WebMagic crawler framework: core engine plus the extension module. -->
    <dependency>
      <groupId>us.codecraft</groupId>
      <artifactId>webmagic-core</artifactId>
      <version>0.5.3</version>
    </dependency>
    <dependency>
      <groupId>us.codecraft</groupId>
      <artifactId>webmagic-extension</artifactId>
      <version>0.5.3</version>
    </dependency>
  </dependencies>

  <build>
    <plugins>
      <!-- Compile for Java 7 with UTF-8 sources. -->
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <version>3.5.1</version>
        <configuration>
          <source>1.7</source>
          <target>1.7</target>
          <encoding>UTF-8</encoding>
        </configuration>
      </plugin>
      <!-- Attach a sources jar. jar-no-fork is the goal meant for lifecycle
           binding; the plain jar goal forks and re-runs part of the build. -->
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-source-plugin</artifactId>
        <version>3.0.1</version>
        <executions>
          <execution>
            <id>attach-sources</id>
            <goals>
              <goal>jar-no-fork</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>
</project>
maven-webmagic 实例源码
阅读全文
0 0
- Maven_Webmagic 实例
- 实例
- 实例
- 实例
- 实例
- 实例
- 实例
- 实例
- 实例
- 实例
- 实例
- 实例
- 实例
- 实例
- 实例
- 实例
- 实例
- 实例
- 九度1035:找出直系亲属<并查集>
- 小随笔
- Eqs POJ
- GSM Hacking:使用BladeRF、树莓派、YatesBTS搭建便携式GSM基站
- Kotlin学习笔记(四)属性
- Maven_Webmagic 实例
- springboot入门-hellospringboot项目搭建
- Python基础-模块
- 20170612初学笔记之python+linux
- spring对象类型属性的注入
- Android Material Design 之 Snackbar
- Git 问题整理
- WPF绘制五子棋盘
- 在枚举中使用位移运算,来判断层叠块