新手写网络爬虫之初体验
来源:互联网 发布:mac arp嗅探 编辑:程序博客网 时间:2024/06/05 19:05
根据上级安排,需要抓取某个网站的数据,用到了网络爬虫相关的知识,于是就去找了许多相关的资料和例子看。网上的例子很多,但都基本是对一些静态页面进行数据抓取。涉及到了模拟登录相关的例子则很少,即使有讲解的也不是很明了。而且写爬虫的框架有很多,最开始别人推荐我使用htmlutil,然后就用了,后来发现这个不是很好,至少我觉得在解析html页面的时候不太方便。后来就用jsoup来做,发现比前者方便很多。话不多说,看具体代码吧:
使用 HttpClient(原文写作 htmlutil,但下面的示例实际使用的是 Apache Commons HttpClient):
public static void test(){HttpClient httpClient = new HttpClient(); GetMethod method = new GetMethod(sUrl); try { // 设置 HttpClient 接收 Cookie,用与浏览器一样的策略 httpClient.getParams().setCookiePolicy( CookiePolicy.BROWSER_COMPATIBILITY); httpClient.executeMethod(method); // 获得登陆后的 Cookie Cookie[] cookies = httpClient.getState().getCookies(); StringBuffer tmpcookies = new StringBuffer(); for (Cookie c :cookies) { tmpcookies.append(c.toString() ); } System.out.println("method,"+method.getResponseBodyAsString()); String dataUrl = "http://~:~b~";//需要访问的url GetMethod getMethod = new GetMethod(dataUrl); // // 每次访问需授权的网址时需带上前面的 cookie 作为通行证 getMethod.setRequestHeader("cookie", tmpcookies.toString()); // System.out.println("cookie,"+tmpcookies.toString()); method.setRequestHeader("Referer", "http://www.cc"); method.setRequestHeader("User-Agent", "www Spot"); httpClient.executeMethod(getMethod); // // 打印出返回数据,检验一下是否成功 String text = getMethod.getResponseBodyAsString(); // System.out.println(text); String queryUrl = "<span style="font-family: Arial, Helvetica, sans-serif;">http://~:~b~</span><span style="font-family: Arial, Helvetica, sans-serif;">/jmjkdaaction.do?path=findList&src=&jtdabzj=";</span> PostMethod postMethod = new PostMethod(queryUrl); NameValuePair[] data = { new NameValuePair("Input",""), new NameValuePair("UnitXzqh","130626201"), new NameValuePair("beginNl",""), new NameValuePair("birthEnd",""), new NameValuePair("birthStart",""), new NameValuePair("dabhSxJx","1"), new NameValuePair("dazt","0"), new NameValuePair("endNl",""), new NameValuePair("jdhs",""), new NameValuePair("jdsjJs",""), new NameValuePair("jdsjKs",""), new NameValuePair("jdys",""), new NameValuePair("jiedao","130626201"), new NameValuePair("jmdacxtj","xm"), new NameValuePair("jtzz",""), new NameValuePair("lrr",""), new NameValuePair("lrsjjs",""), new NameValuePair("lrsjks",""), new NameValuePair("radioOrCheck","radio"), new NameValuePair("rzzt",""), new NameValuePair("sex","0"), new 
NameValuePair("sqlFlag","jmjkda"), new NameValuePair("toPage",""), new NameValuePair("xiaoqu",""), new NameValuePair("xqSxJx","1"), new NameValuePair("xzqhf","130626201"), new NameValuePair("zdSxJx","1"), new NameValuePair("zxsjJs",""), new NameValuePair("zxsjKs","") }; postMethod.setRequestBody(data); httpClient.getParams().setCookiePolicy( CookiePolicy.BROWSER_COMPATIBILITY); httpClient.executeMethod(postMethod); System.out.println(postMethod.getResponseBodyAsString()); String html = postMethod.getResponseBodyAsString();//得到字符串类型的html页面 //TODO:解析。。。。。。。。。 } catch (Exception e) { e.printStackTrace(); } }
/**
 * Entry point: logs in with a jsoup GET (credentials sent as query parameters
 * to loginaction.do) and hands the login {@code Response} to {@code jmjkda}
 * so its session cookie can be reused for the data queries.
 *
 * NOTE(review): this block is truncated by the page scrape — the outer try has
 * no catch/finally and the method is never closed. Kept byte-identical; the
 * missing tail must be recovered from the original article before compiling.
 * {@code loginname}, {@code password} and {@code randomTime} are presumably
 * fields declared elsewhere in this class — confirm.
 */
public static void main( String[] args ){
// Login request.
Response res = null;
try {
res = Jsoup.connect("http://~/bdxzws/loginaction.do?"+ "loginname="+loginname+ "&password="+password+ "&loginFlag=y&url0=0&jsonpCallback=jQuery110204332349256146699_1450866101017"+ "&_="+randomTime).execute();
System.out.println("200表示网址可以访问:" + res.statusCode());
try {
// Time the crawl (endTime is captured but not otherwise used in the visible code).
Long startTime = System.currentTimeMillis();
jmjkda(res);
Long endTime = System.currentTimeMillis();
} catch (Exception e) {
e.printStackTrace();
}
/**
 * Crawls the "resident health record" (jmjkda) custom-query pages and dumps
 * every table row into a local pipe-delimited text file.
 *
 * Flow: one probe POST to read the total row count from the result page, then
 * one POST per page, scraping all {@code class="tableface"} tables into the file.
 *
 * @param res login response whose JSESSIONID cookie authorizes the queries
 * @throws Exception on any network, parse, or file I/O failure
 *
 * NOTE(review): {@code TIMEOUT}, {@code PAGESIZE} and {@code FILEPATH} are
 * presumably class constants defined elsewhere — confirm before reuse.
 */
public static void jmjkda(Response res) throws Exception{
// Query request (1. must carry the session cookie; 2. POST; 3. use a generous timeout).
Connection con = Jsoup.connect("http://~/bdxzws/query/zdycx.do?option=showResult").method(Method.POST).cookie("JSESSIONID", res.cookie("JSESSIONID")).timeout(TIMEOUT);
// Query form data — adjust to the actual target site.
con.data("option","showResult").data("cxdx","jmjkdaview").data("tables","jmjkdaview").data("stattitle","").data("statlisttitle","").data("fieldstitle","居民健康档案编号,居民档案编号,姓名,姓名简拼,身份证号,").data("termlist","").data("showfields","BH,JMDABH,XM,XMJP,SFZH").data("statfields","").data("statlist","").data("sorts","").data("orderfields","BH").data("_target_pageNum","1").data("_page_state","first");
// Execute the probe request.
Response res1 = con.execute();
// This first query exists only to obtain the total record count.
Document s = res1.parse();
Elements input = s.body().getElementsByIndexEquals(4);
// assumes the element at index 17 carries the total count in its value
// attribute — TODO confirm against the actual result page markup.
Element e = input.get(17);
Integer totalNum = Integer.valueOf(e.attr("value"));// total record count
Integer totalPages = totalNum % PAGESIZE == 0 ? totalNum / PAGESIZE: totalNum / PAGESIZE +1;// total page count
System.out.println("获取数据的总条数为:"+totalNum+",总页数为:"+totalPages);
// Form data for the per-page queries, now requesting the full field list.
Map<String ,String> dataMap = new HashMap<String,String>();
dataMap.put("cxdx","jmjkdaview");
dataMap.put("tables","jmjkdaview");
dataMap.put("stattitle","");
dataMap.put("statlisttitle","");
dataMap.put("fieldstitle","居民健康档案编号,居民档案编号,姓名,姓名简拼,身份证号,");
dataMap.put("termlist","");
String showfields = "BH,JMDABH,JDRQ,SSFWJG,SSXZQH,XM,XMJP,SFZH,XB,CSRQ,XX,JTZZ,LXDH,XL,ZY,YLFFFS,HYZK,JDYS,JDHS,SG,TZ,TW,YW"+",SSY,SZY,SFXY,XYSL,SFYJ,YJSL,YSXG,SMQK,STHDSP,YYAH,YWGM,QTGMYW,SWGM,WSWGM,CMGM,HFGM,YHZGX,RQFL,DAZT,ZXYY"+",ZXSJ,LRR,LRSJ,SMXS,GXYQZSJ,TNBQZSJ,GXBQZSJ,ZLQZSJ,TSXX,MZ,GJ,DALB,QTDZLB,QTDZ,BLS,BLSBZ,NCZQZSJ,COPDQZSJ"+",JHBQZSJ,JSFLZQZSJ,GYQZSJ,QTJWS,SSJWS,WSJWS,SXJWS,QTJWSQZSJ,YCBSMC,CJ,CJZH,FQHMBQK,FQHQTBQK,MQHMBQK,MQHQTBQK"+",XDJMHMBQK,XDJMHQTBQK,ZNHMBQK,ZNHQTBQK,QTJZS,JMJKDACARDNUM,BEIZHU,GZDW,LXRXM,LXRDH,EMAIL,QTYLFFFS,QTCJ";
dataMap.put("showfields",showfields);
dataMap.put("statlist","");
dataMap.put("sorts","");
dataMap.put("orderfields","BH");
dataMap.put("_page_state","first");
// NOTE(review): the server appears to accept the pagination SQL from the
// client here — captured from the real request; confirm it is still required.
dataMap.put("_pagination_sql_back","select "+dataMap.get("showfields")+" from jmjkdaview where 1=1 and ( jmjkdaview.lrdw='1F000000000000008126')");
dataMap.put("option","showResult");
// Start from a fresh output file on every run.
File file = new File(FILEPATH+"居民健康档案.txt");
if(file.exists()) {file.delete();}
try(BufferedWriter writer = new BufferedWriter(new FileWriter(file,true))){
writer.write("title");
// Fetch every page according to the total count computed above.
int sum = 0;
for(int i = 1 ; i <= totalPages ; i ++){
// Per-page query — same form data, only the target page number changes.
Connection con2 = Jsoup.connect("http://~/bdxzws/query/zdycx.do?option=showResult").method(Method.POST).cookie("JSESSIONID", res.cookie("JSESSIONID")).timeout(TIMEOUT);
dataMap.put("_target_pageNum",String.valueOf(i));
con2.data(dataMap);
// Execute the page request.
Response res2 = con2.execute();
// Write the results to the local file: one record per line, columns separated by "|".
Document s2 = res2.parse();
Elements tables = s2.body().getElementsByClass("tableface");
for (Element table : tables){
for (Element tr : table.getElementsByTag("tr")){
for (Element td : tr.getElementsByTag("td")){
writer.write(td.text() + "|");
}
writer.write("\n");
writer.flush();
}
}
// Progress report (assumes every page except possibly the last is full).
sum += PAGESIZE;
System.out.println("已抓取了"+ sum +"条数据");
}
}
System.out.println("获取居民健康档案--操作成功。");
}
新手勿喷,有错误的地方请指出。。。
1 0
- 新手写网络爬虫之初体验
- 爬取链家房屋信息-网络爬虫初体验demo
- Python]新手写爬虫全过程
- 写网络爬虫初探
- 自己动手写网络爬虫
- 自己动手写网络爬虫
- 自己动手写网络爬虫
- 学习《自己动手写网络爬虫》之记录1
- 学习《自己动手写网络爬虫》之记录2
- 产品经理新手如何写体验报告?
- web 爬虫初体验
- 【Python网络爬虫 】新手实践笔记--urllib2
- Unity4.3之2D初体验&新手教学
- 第一次写python--网络爬虫
- 写一网络爬虫有感
- 自己动手写网络爬虫1
- 用python写网络爬虫
- WebMagic写的网络爬虫
- Linux随笔一之一键配置LNMP
- 解决Android应用安装快完毕时提示签名冲突
- Java和js时间格式化
- 非分区表迁移到分区表
- Modbus通讯协议学习 - 认识篇
- 新手写网络爬虫之初体验
- Android 广播
- Python初学
- 对抗不可执行告警的四种措施
- iOS9搜索三剑客之CoreSpotlight
- 【LEETCODE】116-Populating Next Right Pointers in Each Node
- 【OMNet++】OMNet++初学建议
- vxlan
- Android小技巧:改变toast位置