【续】自己写的实时爬取 CSDN 2012 博客之星 88位候选人排名

来源:互联网 发布:开通淘宝要下载什么 编辑:程序博客网 时间:2024/05/23 18:37

本篇博客撰写说明:

①时代在变,楼主的需求也发生了一丁点的更新,从入围《CSDN 2012 博客之星》评选,楼主幸运挤进前20名。但是与第10名票数还有一定的差距,故更新程序,查看楼主自己与第十名的 票数差距、排名差距

②有CSDN的朋友对前几天楼主写的《自己写的实时爬取 CSDN 2012 博客之星 88位候选人排名》程序有点兴趣,故将程序 进行优化和重构,并加入了相应的注释,使程序更加具有可读性。

末:由于楼主能力有限,原先发现的该程序爬行88个网页速度过慢,主因系:网速原因,故不再优化。楼主也发现解析各个网页中【用户名、票数、排名】部分有很大的优化空间,如感兴趣的网友,请提供解析部分的优化方案,共同学习哦,亲!

 

如果觉得我的技术文章还有点让列位看官汲取之处,

请给我投上宝贵的一票,以兹鼓励呵,多谢,多谢!!

本人ID:m13666368773

投票地址:http://vote.blog.csdn.net/item/blogstar/m13666368773

凡投票的朋友,

请第一时间在文章下方评论:“当前票数:XXX+已投票+邮箱:XXX@XXX.com”

稍后会将 <Web应用界面设计规范>PPT版本,发给您。

该博客地址:http://blog.csdn.net/m13666368773/article/details/8276810

请稍花点时间,为我投上您手中宝贵的一票,

敬告:我这能看到您的投票“用户名”,请勿虚报!多谢,多谢!!

截止时间:2012-12-30

 

废话不多说:上代码

package com.aptech;import java.io.BufferedReader;import java.io.InputStream;import java.io.InputStreamReader;import java.io.OutputStreamWriter;import java.net.HttpURLConnection;import java.net.URL;import java.text.SimpleDateFormat;import java.util.ArrayList;import java.util.Date;import java.util.HashMap;import java.util.List;import java.util.Map;@SuppressWarnings("unchecked")public class TestPachongUrl {private static Map messageMap = new HashMap();private static List list = new ArrayList();private static String url = "http://vote.blog.csdn.net/item/blogstar/";//抽取公共Url部分/* * 以下user[],手工录入2012年88位CSDN博客之星候选人 */private static String user[] = new String[] { "Testing_is_believing", "t0nsha", "iukey", "yjflinchong", "taomanman", "chinafe", "hliq5399", "dog250", "qinjuning", "cheny_com", "v_JULY_v","zhmxy555", "Purpleendurer", "iihero", "yming0221", "ccanan", "tigerjb", "cheungmine", "hawksoft", "sheismylife", "hfahe", "cyq1984", "littletigerat", "kmyhy", "caimouse", "manoel","xyz_lmn", "hunkcai", "yiyaaixuexi", "norains", "clever101", "leftfist", "xiaominghimi", "niyi0318", "yanghuiliu", "abandonship", "mapdigit", "bill_man", "Augusdi", "LoveLion","sunboy_2050", "kongxx", "21aspnet", "chszs", "thl789", "mylxiaoyi", "akof1314", "yincheng01", "keyboardOTA", "pan_tian", "downmoon", "wangkuifeng0118", "robinson_0612", "bluishglc","coolbacon", "tangcheng_ok", "tianxiaode", "cjjky", "MoreWindows", "mr_raptor", "dojotoolkit", "chelsea", "chgaowei", "teamlet", "IBM_hoojo", "iefreer", "lee576", "jaminwm", "xuhuojun","linghe301", "caolaosanahnu", "ricohzhanglong", "totogo2010", "axman", "ce123", "rabbit729", "nkmnkm", "superdont", "m13666368773", "aomandeshangxiao", "hitlion2008", "siren0203","feixiaoxing", "Poechant", "cloudhsu", "Innost", "yanghua_kobe", "tianlesoftware" };private static final String master = "m13666368773";// 楼主用户名,[关键值],用于从集合中获取楼主信息,包括用户名、当前票数、当前排名private static final String tenthUser = "10";// 第十名,[关键值],用户从集合中获取第十名用户的信息,包括用户名、当前票数、当前排名private static String 
saveMasterMessage = null;// 初始化,用于保存楼主信息private static String saveTenthUserMessage = null;// 初始化,用于保存第十名用户的信息/** * 该方法用于爬取88名候选人投票主页,并记录信息:用户名、当前票数、当前排名 * @param url */public static String test(URL url) throws Exception {/** * 首先要和URL下的URLConnection对话。 URLConnection可以很容易的从URL得到。比如: // Using * java.net.URL and //java.net.URLConnection */HttpURLConnection connection = (HttpURLConnection) url.openConnection();/** * 然后把连接设为输出模式。URLConnection通常作为输入来使用,比如下载一个Web页。 * 通过把URLConnection设为输出,你可以把数据向你个Web页传送。下面是如何做: */connection.setDoOutput(true);connection.setRequestMethod("POST");connection.setRequestProperty("user-agent", "mozilla/4.7 [en] (win98; i)");connection.connect();/** * 最后,为了得到OutputStream,简单起见,把它约束在Writer并且放入POST信息中,例如: ... */OutputStreamWriter out = new OutputStreamWriter(connection.getOutputStream(), "UTF-8");out.flush();out.close();/** * 这样就可以发送一个看起来象这样的POST: POST /jobsearch/jobsearch.cgi HTTP 1.0 ACCEPT: * text/plain Content-type: application/x-www-form-urlencoded * Content-length: 99 username=bob password=someword */// 一旦发送成功,用以下方法就可以得到服务器的回应:String sCurrentLine = "";String sTotalString = "";InputStream l_urlStream;l_urlStream = connection.getInputStream();// 传说中的三层包装阿!BufferedReader l_reader = new BufferedReader(new InputStreamReader(l_urlStream));while ((sCurrentLine = l_reader.readLine()) != null) {sTotalString += sCurrentLine + "\r\n";}int begin0 = sTotalString.indexOf("博客地址:<a href=\"http://blog.csdn.net/");int end0 = sTotalString.indexOf("\" class=\"red\" target=\"_blank\">");int begin1 = sTotalString.indexOf("票数:<span class=\"red\">");int end1 = sTotalString.indexOf("</span> 票</li>");int begin2 = sTotalString.indexOf("当前排名:<span class=\"red\">");int end2 = sTotalString.indexOf("</span> 名</li>");String message = sTotalString.substring(begin0 + 35, end0) + "-" + sTotalString.substring(begin1 + 21, end1) + "=" + sTotalString.substring(begin2 + 23, end2);return message;}/** * 给用户名补充空格,用于显示对齐 * @param user */public static String addBlank(String user) 
{String blank = " ";int userLength = user.length();for (int i = 0; i < 30 - userLength; i++) {user += blank;}return user;}/** * 给表头补充空格,用于显示对齐 * @param message */public static String addChinaBlank(String message) {String blank = " ";int userLength = message.length() * 2;for (int i = 0; i < 70 - userLength; i++) {message += blank;}return message;}/** * 输入 一条用户信息,通过本方法,分别解析出 用户名、当前票数、当前排名,并做对齐处理,返回 * @param message */public static String getRankMessage(String message) {return addBlank(message.substring(0, message.indexOf("-"))) + message.substring(message.indexOf("-") + 1, message.indexOf("=")) + "   "+ message.substring(message.indexOf("=") + 1, message.length());}/** * 主方法,运行一下喽 */public static void main(String[] args) throws Exception {for (int i = 0; i < user.length; i++) {list.add(new URL(url + user[i]));}SimpleDateFormat dateformat = new SimpleDateFormat("yyyy年MM月dd日 HH时mm分ss秒 E ");String nowTime = dateformat.format(new Date());System.out.println("统计时间:" + nowTime);System.out.println("候选人数量:" + user.length);System.out.println(addChinaBlank("用户名") + addChinaBlank("票数") + "排名");for (int i = 0; i < list.size(); i++) {String subMessage = test((URL) list.get(i));String key = subMessage.substring(subMessage.indexOf("=") + 1, subMessage.length());messageMap.put(key, subMessage);}for (int i = 1; i <= 88; i++) {String endMessage = messageMap.get("" + i).toString();System.out.println(getRankMessage(endMessage));if (master.equals(endMessage.substring(0, endMessage.indexOf("-")))) {// 保存楼主信息saveMasterMessage = endMessage;}if (tenthUser.equals(endMessage.substring(endMessage.indexOf("=") + 1, endMessage.length()))) {// 保存第十名用户的信息saveTenthUserMessage = endMessage;}}int tenthUserPiaoshu = Integer.parseInt(saveTenthUserMessage.substring(saveTenthUserMessage.indexOf("-") + 1, saveTenthUserMessage.indexOf("=")));int masterPiaoshu = Integer.parseInt(saveMasterMessage.substring(saveMasterMessage.indexOf("-") + 1, saveMasterMessage.indexOf("=")));int piaoshuGap = tenthUserPiaoshu - 
masterPiaoshu;// 楼主与第十名相差的票数int tenthUserPaiming = Integer.parseInt(saveTenthUserMessage.substring(saveTenthUserMessage.indexOf("=") + 1, saveTenthUserMessage.length()));int masterPaiming = Integer.parseInt(saveMasterMessage.substring(saveMasterMessage.indexOf("=") + 1, saveMasterMessage.length()));int paimingGap = ~(tenthUserPaiming - masterPaiming) + 1;// 楼主与第十名相差的名数System.out.println("=============以下对比楼主与第十名用户的信息===============================");System.out.println(getRankMessage(saveTenthUserMessage));System.out.println(getRankMessage(saveMasterMessage));System.out.println("========================================================================");System.out.println(addBlank("difference tenthUer VS master") + piaoshuGap + "   " + paimingGap);}}


运行一下:

统计时间:2012年12月19日 17时16分34秒 星期三 候选人数量:88用户名                             票数                           排名v_JULY_v                      1347   1MoreWindows                   583   2yiyaaixuexi                   476   3mr_raptor                     435   4xiaominghimi                  410   5yincheng01                    395   6zhmxy555                      391   7yming0221                     379   8Poechant                      358   9ricohzhanglong                346   10LoveLion                      322   11tianlesoftware                286   12taomanman                     282   13m13666368773                  217   14aomandeshangxiao              216   15cheny_com                     176   16linghe301                     160   17dojotoolkit                   149   18hawksoft                      141   19cjjky                         123   20akof1314                      122   21nkmnkm                        120   22clever101                     116   23yanghuiliu                    103   24cyq1984                       103   25niyi0318                      101   26sheismylife                   96   27cloudhsu                      87   28coolbacon                     76   29Testing_is_believing          71   30cheungmine                    56   31bill_man                      55   32tangcheng_ok                  55   3321aspnet                      53   34lee576                        53   35norains                       51   36teamlet                       50   37manoel                        48   38hfahe                         48   39sunboy_2050                   47   40yjflinchong                   47   41tigerjb                       43   42mapdigit                      43   43axman                         42   44Augusdi                       39   45pan_tian                      39   46feixiaoxing                   38   47mylxiaoyi                     37   48t0nsha                        35   49thl789                        35   50qinjuning                     
35   51kongxx                        34   52caimouse                      32   53chgaowei                      32   54dog250                        31   55ce123                         31   56downmoon                      30   57xyz_lmn                       29   58littletigerat                 28   59robinson_0612                 28   60iihero                        28   61siren0203                     28   62Purpleendurer                 28   63iukey                         27   64tianxiaode                    27   65abandonship                   27   66Innost                        27   67wangkuifeng0118               26   68iefreer                       26   69caolaosanahnu                 26   70hunkcai                       25   71chelsea                       25   72totogo2010                    24   73leftfist                      24   74IBM_hoojo                     24   75hitlion2008                   24   76jaminwm                       23   77rabbit729                     23   78yanghua_kobe                  23   79keyboardOTA                   22   80ccanan                        20   81hliq5399                      20   82kmyhy                         20   83superdont                     19   84xuhuojun                      19   85chszs                         18   86chinafe                       17   87bluishglc                     14   88=============以下对比楼主与第十名用户的信息===============================ricohzhanglong                346   10m13666368773                  217   14========================================================================difference tenthUer VS master 129   4


 

 

 

原创粉丝点击