OSCHINA博文抄袭检查

来源:互联网 发布:csol商人 网络去尼玛 编辑:程序博客网 时间:2024/05/29 16:58
/* * * * * * * APDPlat - Application Product Development Platform * * Copyright (c)  * * * * This program is free software: you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * * the Free Software Foundation, either version 3 of the License, or * * (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU General Public License * * along with this program. If not, see <http://www.gnu.org/licenses/>. * * * */   package org.seo.rank.impl;   import org.apache.commons.lang.StringUtilsimport org.jsoup.Connectionimport org.jsoup.Jsoupimport org.jsoup.nodes.Documentimport org.jsoup.nodes.Elementimport org.jsoup.select.Elementsimport org.seo.rank.CopyCheckerimport org.seo.rank.list.DynamicIpimport org.seo.rank.list.UrlToolsimport org.seo.rank.list.impl.DefaultParserimport org.seo.rank.model.Articleimport org.slf4j.Loggerimport org.slf4j.LoggerFactory;   import java.io.UnsupportedEncodingExceptionimport java.net.URLimport java.net.URLEncoderimport java.util.*import java.util.concurrent.atomic.AtomicIntegerimport java.util.stream.Collectors;   /** * 检查文章抄袭情况 * @author 杨尚川 */ public class BaiduCopyCheckerimplements CopyCheckerprivate static final Logger LOGGER = LoggerFactory.getLogger(BaiduCopyChecker.class); private static final String ACCEPT = "text/html, */*; q=0.01"private static final String ENCODING = "gzip, deflate"private static final String LANGUAGE = "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3"private static final String CONNECTION = "keep-alive"private static final String HOST = "www.baidu.com"private static final String REFERER = "http://www.baidu.com"private static final String USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:31.0) Gecko/20100101 Firefox/31.0";   //获取多少页 private static final int PAGE = 15private static final int PAGESIZE = 10;     @Override public Map<Article, Set<String>>check(List<Article>articles) { Map<Article,Set<String>> data= new HashMap<>(); articles.forEach(article -> { data.put(article, doCheck(article)); }); return data; }   public Set<String> doCheck(Article article){ Set<String> data= new HashSet<>(); if(StringUtils.isBlank(article.getTitle()) || StringUtils.isBlank(article.getUrl())){ return data; } String query = nulltry { query = URLEncoder.encode(article.getTitle(),"UTF-8"); } catch (UnsupportedEncodingException e) { LOGGER.error("url构造失败", e); return data; } if(StringUtils.isBlank(query)){ return data; } for (int i= 0; i < PAGE; i++) { String url = "http://www.baidu.com/s?tn=monline_5_dg&ie=utf-8&wd="+ query+"&oq="+query+"&usm=3&f=8&bs="+query+"&rsv_bp=1&rsv_sug3=1&rsv_sug4=141&rsv_sug1=1&rsv_sug=1&pn="+ i * PAGESIZELOGGER.debug(url); data.addAll(doCheck(url, article)); } return data; }   private Set<String> doCheck(String url, Article article) { Set<String> data= new HashSet<>(); tryDocument document = Jsoup.connect(url) .header("Accept",ACCEPT) .header("Accept-Encoding",ENCODING) .header("Accept-Language",LANGUAGE) .header("Connection",CONNECTION) .header("Host",HOST) .header("Referer",REFERER) .header("User-Agent",USER_AGENT) .get(); String titleCssQuery = "html body div div div div div h3.t a"Elements elements = document.select(titleCssQuery); int i=0for(Element element: elements){ String _title = element.text(); if(StringUtils.isBlank(_title)){ continue; } i++LOGGER.debug(i+":"+_title); if(_title.contains("百度翻译"|| !contains(_title, article.getTitle())){ LOGGER.debug("搜索结果检查通过"); continue; } String href = element.attr("href"); href = UrlTools.normalizeUrl(url, href); String realUrl = urlConvert(href); LOGGER.debug("url:"+url); LOGGER.debug("realUrl:"+realUrl); String[] target = new URL(realUrl).getHost().split("\\."); String[] source = new URL(article.getUrl()).getHost().split("\\."); if(target.length>1 && source.length>1 && !(target[target.length-2]+target[target.length-1]).equals(source[source.length-2]+source[source.length-1])) { data.add(realUrl); } } } catch (Exception ex) { LOGGER.error("搜索出错",ex); } return data; } /** * 判断title2是否包含title1,去除标题中的特殊字符 * @param title2 * @param title1 * @return */ private static boolean contains(String title2, String title1){ StringBuilder str2 = new StringBuilder(); StringBuilder str1 = new StringBuilder(); for(char c: title2.toCharArray()){ if(Character.isLetter(c)){ str2.append(c); } } for(char c: title1.toCharArray()){ if(Character.isLetter(c)){ str1.append(c); } } LOGGER.debug("转换标题前:"+title2); LOGGER.debug("转换标题后:"+str2.toString()); LOGGER.debug("转换标题前:"+title1); LOGGER.debug("转换标题后:"+str1.toString()); if(str2.toString().contains(str1.toString())){ LOGGER.debug(title2+" 【包含】 "+title1); return true; } LOGGER.debug(title2+" 【不包含】 "+title1); return false; } /** * 将百度的链接转换为网页的链接 * @param url 百度链接 * @return 网页链接 */ private static String urlConvert(String url){ tryif(!url.startsWith("http://www.baidu.com/link?url=")){ //不需要转换URL return url; } LOGGER.debug("转换前的URL:"+url); Connection.Response response= getResponse(url); //这里要处理爬虫限制 if(response==null|| response.body().contains("请您点击按钮解除封锁"|| response.body().contains("请输入以下验证码")){ //使用新的IP地址 DynamicIp.toNewIp(); response = getResponse(url); } String realUrl = response.header("Location"); LOGGER.debug("转换后的URL:"+realUrl); //检查网页是否被重定向 //这个检查会导致速度有点慢 //这个检测基本没有必要,除非是那种极其特殊的网站,ITEYE曾经就是,后来在我的建议下改进了 /* LOGGER.debug("检查是否有重定向:"+realUrl); Connection.Response response = getResponse(realUrl); //这里要处理爬虫限制 if(response==null || response.body().contains("请您点击按钮解除封锁") || response.body().contains("请输入以下验证码")){ //使用新的IP地址 DynamicIp.toNewIp(); response = getResponse(realUrl); } String realUrl2 = response.header("Location"); if(!StringUtils.isBlank(realUrl2)){ LOGGER.debug("检查到重定向到:"+realUrl2); return realUrl2; } */ return realUrl; }catch(Exception e){ LOGGER.error("URL转换异常", e); } return url; } private static Connection.ResponsegetResponse(Stringurl) { tryConnection.Response response= Jsoup.connect(url) .header("Accept",ACCEPT) .header("Accept-Encoding",ENCODING) .header("Accept-Language",LANGUAGE) .header("Connection",CONNECTION) .header("Host",HOST) .header("Referer",REFERER) .header("User-Agent",USER_AGENT) .ignoreContentType(true) .timeout(30000) .followRedirects(false) .execute(); return response; } catch (Exception e){ LOGGER.debug("获取页面失败:", e); } return null; } public static void main(String[] args){ CopyChecker copyChecker = new BaiduCopyChecker(); //计算OSCHINA博文被抄袭的情况 //List<Article> articles = DefaultParser.oschinaBlog(); //计算ITEYE博文被抄袭的情况 List<Article> articles= DefaultParser.iteyeBlog(); //这里排除不统计的博文 articles=articles.stream().filter(article-> !(article.getTitle().contains("idioms"|| article.getTitle().contains("分布式内存文件系统:Tachyon"|| article.getTitle().contains("Nutch视频"|| article.getTitle().contains("如何解决BUG?"|| article.getTitle().contains("采集电子报纸"|| article.getTitle().contains("汉英双语的差异"|| article.getTitle().contains("分布式搜索算法"|| article.getTitle().contains("The Future of Compass & ElasticSearch"|| article.getTitle().contains("1208个合成词"|| article.getTitle().contains("Java远程调试"|| article.getTitle().contains("What a Wonderful Code"|| article.getTitle().contains("代码评审脚本"|| article.getTitle().contains("Linux Netcat command – The swiss army knife of net"|| article.getTitle().contains("common prefix different suffix")) ).collect(Collectors.toList()); //检查 Map<Article,Set<String>> result= copyChecker.check(articles); //输出检查报告 LOGGER.info("<h4>检查博文数目:"+ articles.size()+"</h4>"); AtomicInteger i = new AtomicInteger(); result.entrySet().stream().sorted((a,b)->b.getValue().size()-a.getValue().size()).forEach(e->String query = nulltry { query = URLEncoder.encode(e.getKey().getTitle(),"UTF-8"); } catch (UnsupportedEncodingException ex) { LOGGER.error("url构造失败", ex); return; } String originURL = e.getKey().getUrl(); if(e.getValue().size()>0) { LOGGER.info("<h4>"+i.incrementAndGet()+"、<a target=\"_blank\" href=\"http://www.baidu.com/s?wd="+ query + "\">"+ e.getKey().getTitle()+ "</a> 抄袭链接有("+e.getValue().size()+")个</h4>"); LOGGER.info("原文链接:<a target=\"_blank\" href=\""+ originURL + "\">"+ originURL + "</a><br/>"); LOGGER.info("抄袭链接:"); LOGGER.info("<ol>"); e.getValue().stream().sorted().forEach(url->LOGGER.info("<li><a target=\"_blank\" href=\""+ url + "\">"+ url + "</a></li>")); LOGGER.info("</ol>"); }elseLOGGER.info(i.incrementAndGet()+"、<a target=\"_blank\" href=\"http://www.baidu.com/s?wd="+ query + "\">"+ e.getKey().getTitle()+ "</a><br/>"); LOGGER.info("原文链接:<a target=\"_blank\" href=\""+ originURL + "\">"+ originURL + "</a> 无抄袭链接<br/>"); } }); } }
0 0
原创粉丝点击