Java爬虫初体验:简单抓取IT之家热评(整合Spring Boot+Elasticsearch+Redis+Mybatis)

来源:互联网 发布:电子杂志app制作软件 编辑:程序博客网 时间:2024/05/22 16:47

爬取主程序

使用Jsoup解析网页源代码

@Componentpublic class WebCrawler {    private static final String encoding = "utf-8";    @Autowired    private HotCommentMapper hotCommentMapper;    @Autowired    private RedisService redisService;    @Autowired    private EsService esService;    private static boolean done = false;    private static final int THREAD_NUM = 15;    private static AtomicInteger page = new AtomicInteger(0);    private static List<String> breakpoints;    /**     * 定时爬取更新     */    //@Scheduled(initialDelay = 1000, fixedRate = 1000*60*60*24*3)    public void start(){        done = false;        System.out.println("开始爬取:"+System.currentTimeMillis());        for (int i = 0;i<THREAD_NUM;++i){            new Thread(new Runnable() {                @Override                public void run() {                    while (!done) {                        int p = page.incrementAndGet();                        crawl(p);                    }                    System.out.println(Thread.currentThread().getName()+":结束:"+System.currentTimeMillis());                }            },"Thread--"+i).start();        }    }    public synchronized void stop(){        done = true;        redisService.listRemove("ithome:breakpoints");        redisService.listAdd("ithome:breakpoints",breakpoints);    }    /**     * @param page :页码     */    public void crawl(int page){        String url = "https://www.ithome.com/ithome/getajaxdata.aspx?" +                "page="+page+"&type=indexpage&randnum="+Math.random();        String src = getHtmlSrc(url,encoding);        List<String> links = getArticleLinks(src);        if (links.size()<=0){            stop();            return ;        }        //不知还有没更好的方法判断最近一次抓取的位置?        
if(redisService.containsValue("ithome:breakpoints",links)){            stop();            return ;        }        //保存第一页链接做结束点        if (page == 1){            breakpoints = links;        }        for (String link:links){            parseAndSaveHotComments(link);        }    }    /**     *     * @param url     * @param encoding 编码     * @return 网页源代码     */    public String getHtmlSrc(String url,String encoding){        StringBuilder src = new StringBuilder();        InputStreamReader isr = null;        try {            URL urlObj = new URL(url);//建立网络链接            URLConnection urlConn = urlObj.openConnection();//打开链接            isr = new InputStreamReader(urlConn.getInputStream(),encoding);//建立文件输入流            BufferedReader reader = new BufferedReader(isr);//建立缓冲            String line = null;            while ((line = reader.readLine())!=null){                src.append(line);            }        }catch (Exception e){            e.printStackTrace();        }finally {            try {                if (isr != null){                    isr.close();                }            }catch (Exception e){                e.printStackTrace();            }        }        return src.toString();    }    /**     * @param srcCode     * @return 解析源代码,获取文章链接     */    public List<String> getArticleLinks(String srcCode){        List<String> links = new ArrayList<String>();        Document document = Jsoup.parse(srcCode);        Elements articleEls = document.select("h2>a");        for (Element el:articleEls){            String href = el.attr("href");            links.add(href);        }        return links;    }    /**     *     * @param articleHref 文章链接     * @description 使用Jsoup解析热评内容并保存     */    public void parseAndSaveHotComments(String articleHref){        String articlePage = getHtmlSrc(articleHref,encoding);        Document document = Jsoup.parse(articlePage);        Element iframeEl = document.getElementById("ifcomment");        if(iframeEl == null) {            
return ;        }        String commentHref = iframeEl.attr("src");//评论页面URL        //获取文章ID        document = Jsoup.parse(getHtmlSrc("http:"+commentHref,encoding));        Element articleIdInput = document.getElementById("newsid");        String articleId = articleIdInput.attr("value");        //获取热评数据并解析        String link = "http://dyn.ithome.com/ithome/getajaxdata.aspx?newsID="+articleId+"&type=hotcomment";        String hotCommentPage = getHtmlSrc(link,encoding);        document = Jsoup.parse(hotCommentPage);        Elements hotCommentEls = document.select("li.entry");        HotComment hotComment = null;        for (Element el:hotCommentEls){            hotComment = new HotComment();            String  commontId = el.attr("cid");            String user = el.select("strong.nick a").text();            String comment = el.getElementsByTag("P").text();            int up = getNumber(el.select("a.s").text());            int down = getNumber(el.select("a.a").text());            String posandtime = el.select("span.posandtime").text();            String mobile = el.select("span.mobile a").text();            hotComment = new HotComment();            hotComment.setCommentId(commontId);            hotComment.setArticleUrl(articleHref);            hotComment.setUser(user);            hotComment.setComment(comment);            hotComment.setUp(up);            hotComment.setDown(down);            hotComment.setPosandtime(posandtime);            hotComment.setMobile(mobile);            hotCommentMapper.addHotComment(hotComment);//保存数据至数据库,这里保不保存其实都可以            esService.addHotComment(hotComment);//添加索引            if(hotComment.getUp()>=2500){                redisService.rankAdd("ithome:hotrank",hotComment);//缓存大于2500个赞的热评            }            //System.out.println(hotComment.toString());        }    }    /**     *     * @param str     * @return 解析"()"中的数字     */    public int getNumber(String str){        Pattern pattern = Pattern.compile("(?<=\\()(.+?)(?=\\))");        
Matcher matcher = pattern.matcher(str);        if(matcher.find()){            return Integer.parseInt(matcher.group());        }        return 0;    }//    public static void main(String [] args){//        new WebCrawler().start();//    }}

整合Elasticsearch

相关依赖:

<!-- Elasticsearch full-text search -->
<dependency>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-data-elasticsearch</artifactId>
</dependency>

application.yml配置:

spring:
  data:
    # Elasticsearch settings
    elasticsearch:
      cluster-name: elasticsearch
      cluster-nodes: localhost:9300

代码部分:

Repository

/**
 * Elasticsearch repository for {@link HotComment} documents identified by a {@code Long} id.
 */
public interface EsRepository extends ElasticsearchRepository<HotComment, Long> {

    /**
     * Looks up hot comments by their {@code user} field.
     *
     * @param user user name to match
     * @return the matching hot comments
     */
    List<HotComment> findByUser(String user);
}

实体

@Document(indexName="hotcomments",type="hotcomment",indexStoreType="fs",shards=5,replicas=1,refreshInterval="-1")public class HotComment implements Serializable{    private static final long serialVersionUID = -4249699545233058684L;    @Id    private Long id;//热评编号    private String commentId;    private String user;//用户    private String comment;//内容    private int up;//支持数    private int down;//反对数    private String posandtime;//位置和时间    private String mobile;//设备    private String articleUrl;//源文章地址    public Long getId() {        return id;    }    public void setId(Long id) {        this.id = id;    }    public String getCommentId() {        return commentId;    }    public void setCommentId(String commentId) {        this.commentId = commentId;    }    public String getUser() {        return user;    }    public void setUser(String user) {        this.user = user;    }    public String getComment() {        return comment;    }    public void setComment(String comment) {        this.comment = comment;    }    public int getUp() {        return up;    }    public void setUp(int up) {        this.up = up;    }    public int getDown() {        return down;    }    public void setDown(int down) {        this.down = down;    }    public String getPosandtime() {        return posandtime;    }    public void setPosandtime(String posandtime) {        this.posandtime = posandtime;    }    public String getMobile() {        return mobile;    }    public void setMobile(String mobile) {        this.mobile = mobile;    }    public String getArticleUrl() {        return articleUrl;    }    public void setArticleUrl(String articleUrl) {        this.articleUrl = articleUrl;    }    @Override    public String toString() {        return "HotComment{" +                "id='" + id + '\'' +                "commentId='" + commentId + '\'' +                ", user='" + user + '\'' +                ", comment='" + comment + '\'' +                ", up=" + up +                ", 
down=" + down +                ", posandtime='" + posandtime + '\'' +                ", mobile='" + mobile + '\'' +                ", articleUrl='" + articleUrl + '\'' +                '}';    }}

Service

/**
 * Service facade over {@link EsRepository} for indexing and querying hot comments.
 */
@Service
public class EsService {

    @Autowired
    private EsRepository esRepository;

    /**
     * Saves the given hot comment through the repository.
     *
     * @param hotComment comment to index
     */
    public void addHotComment(HotComment hotComment) {
        esRepository.save(hotComment);
    }

    /**
     * Finds hot comments by user.
     *
     * <p>The result is cached in the {@code ithome:hotcomments} cache under the key
     * {@code 'ithome:user:' + user}.
     *
     * @param user user name to search for
     * @return the hot comments returned by the repository
     */
    @Cacheable(value = "ithome:hotcomments", key = "'ithome:user:'+#user")
    public List<HotComment> findByUser(String user) {
        List<HotComment> matches = esRepository.findByUser(user);
        return matches;
    }
}

整合Redis

相关依赖

<!-- Redis -->
<dependency>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-data-redis</artifactId>
</dependency>

相关配置

application.yml配置

spring:
  # Redis settings
  redis:
    database: 0
    host: localhost
    port: 6379
    password: redis
    pool:
      max-active: 15
      max-wait: 1
      max-idle: 0
    timeout: 0

缓存相关配置

@Configuration@EnableCachingpublic class RedisConfig {    @Bean    public KeyGenerator keyGenerator(){        return new KeyGenerator(){            @Override            public Object generate(Object o, Method method, Object... objects) {                StringBuilder sb = new StringBuilder();                sb.append(o.getClass().getName());                sb.append(method.getName());                for(Object obj : objects){                    sb.append(obj.toString());                }                return sb.toString();            }        };    }    @Bean    public CacheManager cacheManager(RedisTemplate redisTemplate){        RedisCacheManager redisCacheManager = new RedisCacheManager(redisTemplate);//        redisCacheManager.setDefaultExpiration(60*60*24);//缓存失效时间,单位:s        Map<String,Long> map = new HashMap<>();        map.put("ithome:hotcomments",60*60*24L);        return redisCacheManager;    }    @Bean    public RedisTemplate<String,String> redisTemplate(RedisConnectionFactory factory){        StringRedisTemplate template = new StringRedisTemplate(factory);        Jackson2JsonRedisSerializer jackson2JsonRedisSerializer = new Jackson2JsonRedisSerializer(Object.class);        ObjectMapper om = new ObjectMapper();        om.setVisibility(PropertyAccessor.ALL, JsonAutoDetect.Visibility.ANY);        om.enableDefaultTyping(ObjectMapper.DefaultTyping.NON_FINAL);        jackson2JsonRedisSerializer.setObjectMapper(om);        template.setValueSerializer(jackson2JsonRedisSerializer);        template.afterPropertiesSet();        return template;    }}

代码部分:

Service

/**
 * Redis helper used by the crawler (breakpoint list) and the web layer (hot-comment rank).
 */
@Service
public class RedisService {

    @Autowired
    private RedisTemplate redisTemplate;

    /**
     * Adds a hot comment to the sorted set {@code key}, scored by its up-vote count.
     *
     * @param key        sorted-set key
     * @param hotComment comment to add
     */
    public void rankAdd(String key, HotComment hotComment) {
        ZSetOperations<String, HotComment> zSetOperations = redisTemplate.opsForZSet();
        zSetOperations.add(key, hotComment, hotComment.getUp());
    }

    /**
     * Returns the {@code top} highest-scored comments of the sorted set, best first.
     *
     * <p>The previous implementation used {@code range(key, 0, top)}, which walks the set in
     * ascending score order with an inclusive end index and therefore returned the
     * {@code top + 1} least up-voted comments.
     *
     * @param key sorted-set key
     * @param top maximum number of entries to return
     * @return at most {@code top} comments; empty when {@code top <= 0}
     */
    public Set<HotComment> rankGet(String key, int top) {
        if (top <= 0) {
            return java.util.Collections.<HotComment>emptySet();
        }
        ZSetOperations<String, HotComment> zSetOperations = redisTemplate.opsForZSet();
        return zSetOperations.reverseRange(key, 0, top - 1);
    }

    /**
     * Appends the given values to the list {@code key}; used to save the position of the most
     * recent crawl.
     *
     * @param key    list key
     * @param values values to append; a null or empty list is ignored because there is nothing
     *               to push
     */
    public void listAdd(String key, List<String> values) {
        if (values == null || values.isEmpty()) {
            return;
        }
        ListOperations<String, String> listOperations = redisTemplate.opsForList();
        listOperations.rightPushAll(key, values);
    }

    /**
     * Deletes the given key.
     *
     * @param key key to delete
     */
    public void listRemove(String key) {
        redisTemplate.delete(key);
    }

    /**
     * Tells whether any of {@code values} is present in the list stored at {@code key}; used to
     * detect that the crawl reached its end position.
     *
     * @param key    list key
     * @param values candidate values
     * @return {@code true} if at least one candidate is stored in the list
     */
    public boolean containsValue(String key, List<String> values) {
        if (values == null || values.isEmpty()) {
            return false;
        }
        ListOperations<String, String> listOperations = redisTemplate.opsForList();
        List<String> stored = listOperations.range(key, 0, -1);
        if (stored == null || stored.isEmpty()) {
            return false;
        }
        for (String val : values) {
            if (stored.contains(val)) {
                return true;
            }
        }
        return false;
    }
}

整合Mybatis

依赖配置

        <!-- MyBatis -->
        <dependency>
            <groupId>org.mybatis.spring.boot</groupId>
            <artifactId>mybatis-spring-boot-starter</artifactId>
            <version>1.3.1</version>
        </dependency>
        <!-- MySQL JDBC driver -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <scope>runtime</scope>
        </dependency>
        <!-- Data source (Druid) -->
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>druid</artifactId>
            <version>1.0.24</version>
        </dependency>

数据源配置

/**
 * Database configuration: registers the Druid data source and scans
 * {@code com.crazy.ithomecrawler.mybatis.mapper} for MyBatis mappers.
 */
@Configuration
@MapperScan(basePackages = "com.crazy.ithomecrawler.mybatis.mapper")
public class DatabaseConfig {

    /**
     * Creates the Druid data source for the local {@code ithome} MySQL database.
     *
     * <p>NOTE(review): URL and credentials are hard-coded here; consider moving them to
     * external configuration.
     *
     * @return the configured data source
     */
    @Bean
    public DataSource druidDataSource() {
        DruidDataSource druid = new DruidDataSource();
        druid.setDriverClassName("com.mysql.jdbc.Driver");
        druid.setUrl("jdbc:mysql://localhost:3306/ithome");
        druid.setUsername("root");
        druid.setPassword("mysql");
        return druid;
    }
}

代码部分:

Mapper

/**
 * MyBatis mapper for the {@code hot_comment} table.
 */
public interface HotCommentMapper {

    /**
     * Inserts one hot comment; the generated {@code id} column is written back to the
     * {@code id} property of the argument.
     *
     * @param hotComment comment to insert
     */
    @Insert("INSERT INTO hot_comment(vCommentId,vUser,vComment,iUp,iDown,vPosandTime,vMobile,vArticleUrl) "
            + "VALUES(#{commentId},#{user},#{comment},#{up},#{down},#{posandtime},#{mobile},#{articleUrl})")
    @Options(useGeneratedKeys = true, keyProperty = "id", keyColumn = "id")
    void addHotComment(HotComment hotComment);
}

Controller

/**
 * Web endpoints under {@code /ithome}; both render the {@code search} view with a
 * {@code comments} model attribute.
 */
@Controller
@RequestMapping("/ithome")
public class HotCommentController {

    @Autowired
    private RedisService redisService;
    @Autowired
    private EsService esService;

    /**
     * Home page: shows the entries returned by the Redis rank {@code ithome:hotrank}.
     *
     * @return the {@code search} view populated with the ranked comments
     */
    @GetMapping("/index")
    public ModelAndView index() {
        ModelAndView view = new ModelAndView("search");
        Set<HotComment> ranked = redisService.rankGet("ithome:hotrank", 50);
        return view.addObject("comments", ranked);
    }

    /**
     * Search: looks up hot comments whose user matches the path variable.
     *
     * @param keyword user name taken from the URL
     * @return the {@code search} view populated with the matches
     */
    @GetMapping("/search/{keyword}")
    public ModelAndView search(@PathVariable("keyword") String keyword) {
        ModelAndView view = new ModelAndView("search");
        List<HotComment> matches = esService.findByUser(keyword);
        return view.addObject("comments", matches);
    }
}

主程序

/**
 * Spring Boot entry point; enables Elasticsearch repositories and scheduling.
 */
@SpringBootApplication
@EnableElasticsearchRepositories
@EnableScheduling
public class IthomecrawlerApplication {

    /**
     * Boots the application context.
     *
     * @param args command-line arguments passed through to Spring Boot
     */
    public static void main(String[] args) {
        SpringApplication.run(IthomecrawlerApplication.class, args);
    }
}

完整application.yml文件

# Server port
server:
  port: 8081
spring:
  data:
    # Elasticsearch settings
    elasticsearch:
      cluster-name: elasticsearch
      cluster-nodes: localhost:9300
  # Redis settings
  redis:
    database: 0
    host: localhost
    port: 6379
    password: redis
    pool:
      max-active: 15
      max-wait: 1
      max-idle: 0
    timeout: 0
  # FreeMarker settings
  freemarker:
    # Whether attribute overriding is allowed
    allow-request-override: false
    allow-session-override: false
    cache: true
    check-template-location: true
    content-type: text/html
    # Expose request attributes
    expose-request-attributes: false
    expose-session-attributes: false
    expose-spring-macro-helpers: false
    suffix: .ftl
    template-loader-path: classpath:/templates/
    request-context-attribute: request
    settings:
      classic_compatible: true
      locale: zh_CN
      date_format: yyyy-MM-dd
      time_format: HH:mm:ss
      datetime_format: yyyy-MM-dd HH:mm:ss

完整pom.xml文件

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <!-- NOTE(review): "comcrazy" looks like a typo for "com.crazy" (the mapper package is
         com.crazy.ithomecrawler.mybatis.mapper); confirm before changing the coordinates. -->
    <groupId>comcrazy</groupId>
    <artifactId>ithomecrawler</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <packaging>jar</packaging>

    <name>ithomecrawler</name>
    <description>ITHome Crawler.</description>

    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>1.5.6.RELEASE</version>
        <relativePath/> <!-- lookup parent from repository -->
    </parent>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
        <java.version>1.8</java.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>
        <!-- Elasticsearch full-text search -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-data-elasticsearch</artifactId>
        </dependency>
        <!-- Redis -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-data-redis</artifactId>
        </dependency>
        <!-- MyBatis -->
        <dependency>
            <groupId>org.mybatis.spring.boot</groupId>
            <artifactId>mybatis-spring-boot-starter</artifactId>
            <version>1.3.1</version>
        </dependency>
        <!-- MySQL JDBC driver -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <scope>runtime</scope>
        </dependency>
        <!-- Data source (Druid) -->
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>druid</artifactId>
            <version>1.0.24</version>
        </dependency>
        <!-- jsoup -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.10.3</version>
        </dependency>
        <!-- FreeMarker -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-freemarker</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>
    </dependencies>

    <build>
        <finalName>ithomecrawler</finalName>
        <plugins>
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
            </plugin>
        </plugins>
    </build>
</project>

完整代码

原创粉丝点击