Maven_Webmagic 实例

来源:互联网 发布:北京教育软件开发公司 编辑:程序博客网 时间:2024/05/22 17:41

webmagic中文文档

(图片:webmagic 中文文档截图)

项目目录

(图片:项目目录结构截图)

Device.java

package com.demo.webmagic.bean;import java.util.Date;public class Device {    private Integer id; //    private String code; //    private String name; //    private String model; //    private String manufacturer; //    private String country; //    private String contact; //    private String contactNumber; //    private String email; //    private String institute; //    private String location; //    private Object specification; //    private Object performance; //    private Object application; //    private Object description; //    private String purchaseDate; //    private String price; //    private Object feeStandard; //    private String imageUrl; //    private String imageLocal; //    private String province; //    private String dataSource; //    private String recorder; //    private Date recordDateTime; //    private String nameEn; //    private String postCode; //    public Integer getId() {        return this.id;    }    public void setId(Integer id) {        this.id = id;    }    public String getCode() {        return this.code;    }    public void setCode(String code) {        this.code = code;    }    public String getName() {        return this.name;    }    public void setName(String name) {        this.name = name;    }    public String getModel() {        return this.model;    }    public void setModel(String model) {        this.model = model;    }    public String getManufacturer() {        return this.manufacturer;    }    public void setManufacturer(String manufacturer) {        this.manufacturer = manufacturer;    }    public String getCountry() {        return this.country;    }    public void setCountry(String country) {        this.country = country;    }    public String getContact() {        return this.contact;    }    public void setContact(String contact) {        this.contact = contact;    }    public String getContactNumber() {        return this.contactNumber;    }    public void setContactNumber(String contactNumber) {     
   this.contactNumber = contactNumber;    }    public String getEmail() {        return this.email;    }    public void setEmail(String email) {        this.email = email;    }    public String getInstitute() {        return this.institute;    }    public void setInstitute(String institute) {        this.institute = institute;    }    public String getLocation() {        return this.location;    }    public void setLocation(String location) {        this.location = location;    }    public Object getSpecification() {        return this.specification;    }    public void setSpecification(Object specification) {        this.specification = specification;    }    public Object getPerformance() {        return this.performance;    }    public void setPerformance(Object performance) {        this.performance = performance;    }    public Object getApplication() {        return this.application;    }    public void setApplication(Object application) {        this.application = application;    }    public Object getDescription() {        return this.description;    }    public void setDescription(Object description) {        this.description = description;    }    public String getPurchaseDate() {        return this.purchaseDate;    }    public void setPurchaseDate(String purchaseDate) {        this.purchaseDate = purchaseDate;    }    public String getPrice() {        return this.price;    }    public void setPrice(String price) {        this.price = price;    }    public Object getFeeStandard() {        return this.feeStandard;    }    public void setFeeStandard(Object feeStandard) {        this.feeStandard = feeStandard;    }    public String getImageUrl() {        return this.imageUrl;    }    public void setImageUrl(String imageUrl) {        this.imageUrl = imageUrl;    }    public String getImageLocal() {        return this.imageLocal;    }    public void setImageLocal(String imageLocal) {        this.imageLocal = imageLocal;    }    public String getProvince() {    
    return this.province;    }    public void setProvince(String province) {        this.province = province;    }    public String getDataSource() {        return this.dataSource;    }    public void setDataSource(String dataSource) {        this.dataSource = dataSource;    }    public String getRecorder() {        return this.recorder;    }    public void setRecorder(String recorder) {        this.recorder = recorder;    }    public Date getRecordDateTime() {        return this.recordDateTime;    }    public void setRecordDateTime(Date recordDateTime) {        this.recordDateTime = recordDateTime;    }    public String getNameEn() {        return this.nameEn;    }    public void setNameEn(String nameEn) {        this.nameEn = nameEn;    }    public String getPostCode() {        return this.postCode;    }    public void setPostCode(String postCode) {        this.postCode = postCode;    }    @Override    public String toString() {        return "Device [id=" + id + ", code=" + code + ", name=" + name + ", model=" + model + ", manufacturer="                + manufacturer + ", country=" + country + ", contact=" + contact + ", contactNumber=" + contactNumber                + ", email=" + email + ", institute=" + institute + ", location=" + location + ", specification="                + specification + ", performance=" + performance + ", application=" + application + ", description="                + description + ", purchaseDate=" + purchaseDate + ", price=" + price + ", feeStandard=" + feeStandard                + ", imageUrl=" + imageUrl + ", imageLocal=" + imageLocal + ", province=" + province + ", dataSource="                + dataSource + ", recorder=" + recorder + ", recordDateTime=" + recordDateTime + ", nameEn=" + nameEn                + ", postCode=" + postCode + "]";    }}

ShanxiProcessor.java

package com.demo.webmagic.processor;import java.util.Date;import java.util.List;import com.demo.webmagic.bean.Device;import com.demo.webmagic.util.ImageDownloader;import com.demo.webmagic.util.ImageDownloader.ImgNameType;import us.codecraft.webmagic.Page;import us.codecraft.webmagic.Site;import us.codecraft.webmagic.Spider;import us.codecraft.webmagic.processor.PageProcessor;import us.codecraft.webmagic.selector.Html;public class ShanxiProcessor implements PageProcessor {    private static int currentPage = 87;    private static final int LAST_PAGE = 112;    public static final String SAVE_PATH = "D:/image/shanxi/";    public static final String DOMAIN = "http://www.tydxyq.cn";    public static final String URL_LIST = "http://www\\.tydxyq\\.cn/yqsb/list.asp\\?page=\\.*";    public static final String URL_POST = "http://www\\.tydxyq\\.cn/yqsb/detail.asp\\?ID=\\.*";    public static final String PREFIX_LIST = "http://www.tydxyq.cn/yqsb/list.asp?page=";    public static final String PREFIX_POST = "http://www.tydxyq.cn/yqsb/detail.asp?ID=";    private Site site = Site.me()            .setRetryTimes(3)            .setTimeOut(10000)            .setCharset("GBK")            .setDomain("www.tydxyq.cn")            .setSleepTime(3000)            .setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");    @Override    public void process(Page page) {//       System.out.println(page.getHtml());        if (page.getUrl().regex(URL_LIST).match()) {            if (currentPage > LAST_PAGE) {                return;            }            addTargetRequests(page);            return;        }         if (!page.getUrl().regex(URL_LIST).match()) {            Device device = null;            try {                device = createDevice(page);            } catch (Exception e1) {                e1.printStackTrace();            }            System.out.println(device);            try {//              
deviceService.add(device);            } catch (Exception e) {                e.printStackTrace();            }        }    }    private void addTargetRequests(Page page) {        List<String> urlList = page.getHtml().xpath("//tr[@align='center']").links().all();        for (String urlString : urlList) {            if (urlString.contains(PREFIX_POST)) {                page.addTargetRequest(urlString);            }        }        page.addTargetRequest(PREFIX_LIST + currentPage++);    }    private Device createDevice(Page page) {        Html html = page.getHtml();        Device device = new Device();        device.setCode(createCode(page));        String imageUrl = html.xpath("//table[4]//tr/td[2]/table[2]//a/@href").toString();        device.setImageUrl(imageUrl);        try {            String imageLocal = ImageDownloader.download(imageUrl, SAVE_PATH, ImgNameType.OBTAIN);            device.setImageLocal(imageLocal);        } catch (Exception e) {            e.printStackTrace();        }        device.setName(html.xpath("//table[4]//tr/td[2]/table[1]//tr[1]/td//strong/text()").toString().trim());        device.setModel(html.xpath("//table[4]//tr/td[2]/table[1]//tr[1]/td/text()").toString().substring(3));//      device.setUnivercity(html.xpath("//table[4]//tr/td[2]/table[1]//tr[4]/td[2]/allText()").toString().trim());        device.setPurchaseDate(html.xpath("//table[4]//tr/td[2]/table[1]//tr[9]/td[2]/text()").toString().trim());        device.setPrice(html.xpath("//table[4]//tr/td[2]/table[1]//tr[10]/td[2]/text()").toString().trim());        device.setApplication(html.xpath("//table[4]//tr/td[2]/table[1]//tr[14]/td[2]/allText()").toString().trim());        device.setFeeStandard(html.xpath("//table[4]//tr/td[2]/table[3]//table[@id='tb1']//tr[3]/td/allText()").toString());        device.setInstitute(html.xpath("//table[4]//tr/td[2]/table[3]//table[@id='tb1']//tr[4]/td/text()").toString().substring(1));        
device.setContact(html.xpath("//table[4]//tr/td[2]/table[3]//table[@id='tb1']//tr[5]/td/text()").toString().substring(1));        device.setEmail(html.xpath("//table[4]//tr/td[2]/table[3]//table[@id='tb1']//tr[6]/td/text()").toString().substring(1));        device.setContactNumber(html.xpath("//table[4]//tr/td[2]/table[3]//table[@id='tb1']//tr[7]/td/allText()").toString().substring(6));        device.setSpecification(html.xpath("//table[4]//tr/td[2]/table[3]//table[@id='tb2']//td/allText()").toString().substring(5));        device.setCountry(html.xpath("//table[4]//tr/td[2]/table[3]//table[@id='tb4']//tr[1]/td/text()").toString().substring(1));        device.setManufacturer(html.xpath("//table[4]//tr/td[2]/table[3]//table[@id='tb4']//tr[2]/td/text()").toString().substring(1));        device.setProvince("山西省");        device.setDataSource(DOMAIN);        device.setRecorder("liuzhiguo");        device.setRecordDateTime(new Date());        return device;    }    private String createCode(Page page) {        String urlString = page.getUrl().toString();        return urlString.substring(urlString.lastIndexOf("=") + 1);    }    @Override    public Site getSite() {        return site;    }    public static void main(String[] args) {        Spider.create(new ShanxiProcessor()).addUrl(PREFIX_LIST + currentPage++).thread(10).run();    }}

ImageDownloader.java

package com.demo.webmagic.util;import java.io.File;import java.io.FileOutputStream;import java.io.IOException;import java.io.InputStream;import java.io.OutputStream;import java.net.URL;import java.net.URLConnection;import java.util.UUID;public class ImageDownloader {    public enum ImgNameType {        OBTAIN, UUID    }    public static String download(String imageUrl, String savePath, ImgNameType imgNameType) throws Exception {        if (imageUrl == null) {            return null;        }        String imageName = obtainImageName(imageUrl, imgNameType);        String imgSavePath = createImgSavePath(savePath, imageName);        if (new File(imgSavePath).exists()) {            System.out.println("图片已存在:" + imgSavePath);            return imgSavePath;        }        downloadImage(imageUrl, imgSavePath);        return imgSavePath;    }    // 图片路径中出现中文 会出错    private static void downloadImage(String imageUrl, String imgSavePath) throws Exception {        URLConnection con = new URL(imageUrl).openConnection();        con.setConnectTimeout(5 * 1000);        InputStream is = con.getInputStream();        OutputStream os = new FileOutputStream(imgSavePath);        byte[] bs = new byte[2048];        int len;        while ((len = is.read(bs)) != -1) {            os.write(bs, 0, len);        }        closeIOStream(is, os);    }    private static String createImgSavePath(String savePath, String imageName) {        File sf = createFolder(savePath);        return sf.getPath() + "\\" + imageName;    }    private static String obtainImageName(String urlString, ImgNameType imgNameType) {        if (imgNameType == ImgNameType.UUID) {            return UUID.randomUUID().toString() + ".jpg";        }        if (urlString.contains("?")) {            return urlString.substring(urlString.lastIndexOf("=") + 1) + ".jpg";        }        // if (!urlString.contains("\\.")) {        // return urlString.substring(urlString.lastIndexOf("/") + 1) + ".jpg";        // }        return 
urlString.substring(urlString.lastIndexOf("/") + 1);    }    private static void closeIOStream(InputStream is, OutputStream os) {        if (is != null) {            try {                is.close();            } catch (IOException e) {                e.printStackTrace();            }        }        if (os != null) {            try {                os.close();            } catch (IOException e) {                e.printStackTrace();            }        }        System.out.println("下载完成");    }    private static File createFolder(String savePath) {        File sf = new File(savePath);        if (!sf.exists()) {            sf.mkdirs();        }        return sf;    }}

pom.xml

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <!-- Project coordinates -->
    <groupId>com.demo</groupId>
    <artifactId>maven-webmagic</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <packaging>jar</packaging>

    <name>maven-webmagic</name>
    <url>http://maven.apache.org</url>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

    <!-- WebMagic crawler framework: core engine plus the extension module -->
    <dependencies>
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>0.5.3</version>
        </dependency>
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-extension</artifactId>
            <version>0.5.3</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <!-- Compile as Java 1.7 with UTF-8 sources -->
            <plugin>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.5.1</version>
                <configuration>
                    <source>1.7</source>
                    <target>1.7</target>
                    <encoding>UTF-8</encoding>
                </configuration>
            </plugin>
            <!-- Attach a sources jar to the build -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-source-plugin</artifactId>
                <version>3.0.1</version>
                <executions>
                    <execution>
                        <id>attach-sources</id>
                        <goals>
                            <goal>jar</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>

maven-webmagic 实例源码

原创粉丝点击