爬虫
来源:互联网 发布:如何把json解析出来 编辑:程序博客网 时间:2024/05/18 22:45
package demo;import com.google.common.base.Joiner;import com.mongodb.BasicDBObject;import com.mongodb.DB;import com.mongodb.DBCollection;import com.mongodb.MongoClient;import org.jsoup.Connection;import org.jsoup.Jsoup;import org.jsoup.nodes.Document;import org.jsoup.nodes.Element;import org.jsoup.select.Elements;import redis.clients.jedis.Jedis;import java.io.IOException;import java.util.ArrayList;import java.util.List;import javax.swing.plaf.synth.SynthSpinnerUI;public class RedisSpider2 { private static MongoClient mongo = new MongoClient( "localhost" , 27017 ); private static Jedis jedis=new Jedis("localhost"); /* private static Queue celebrate_queue=new Queue();//大V队列 private static void InputSeedIntoQueue(){ celebrate_queue.enQueue("1782270602,286"); celebrate_queue.enQueue("yiwenmu,575"); } */ public static String getSeedFromRedis() { // 从Redis中获取数据并出队,uid以及connum,取出后处理分离,取出后即刻销毁 String element = jedis.lpop("follower:uid_connum"); // System.out.println("[DEBUG] : 当前使用了: " + element); System.out.println("[DEBUG] : 现出队 -------------------- " + element); return element; } public static void writeDataIntoMongo(BasicDBObject document){ MongoClient mongo; mongo = new MongoClient( "localhost" , 27017 ); DB db = mongo.getDB("Belle"); //得到数据库 DBCollection table = db.getCollection("Following"); //拿到table table.save(document); } public static void main(String[] args) throws IOException { //InputSeedIntoQueue(); System.out.println("成功读取种子文件"); while(true){ String uid_fonum=celebrate_queue.deQueue();//读种子用户 System.out.println("[DEBUG] 大V "+uid_fonum+" 已出队列"); String[] parts = uid_fonum.split(",");//以逗号分隔开 存入parts字符串数组中 String uid = parts[0]; //分隔的第一个部分就是uid int following = Integer.parseInt(parts[1]); //把字符串转转换成整型 System.out.println("[DEBUG]----开始爬取-----"+uid_fonum+"-------"); crawler_and_toMongo(uid,following); //种子用户开始爬 } } private static void crawler_and_toMongo(String uid, int following) { float total_number = (float) (following*1.0/32 +3); //一页32个关注用户 ArrayList<String> certain_celebrate_list = new ArrayList(); for(int i=1;i<total_number;i++){ System.out.println("The current uid is "+uid+" and the page number is "+i); String following_url = "http://tw.weibo.com/"+uid+"/follow/p/"+i; Connection con = Jsoup.connect("http://tw.weibo.com/api/user/follow");//和url建立连接 con.data("uid",uid); con.data("page",String.valueOf(i)); con.data("currentuid","1991953413"); con.data("page_size","32"); //设了一个cookie 以新浪用户登录的 con.header("Cookie","SINAGLOBAL=9711952802808.682.1499668084041; __gads=ID=130480cbf75a5ad6:T=1500258878:S=ALNI_MZRcFnJveJoxtD4TBbJNbD__KcD-A; UM_distinctid=15d4e658b271f5-0573ee5fbac68c-1b1d7751-1fa400-15d4e658b28944; wvr=6; UOR=,,www.shejidaren.com; _ga=GA1.2.84290483.1500258858; _gid=GA1.2.952496907.1501988163; crtg_rta=; SSOLoginState=1502013392; SCF=Agl4g4wcoIanRFGqnVTCrYVRKcGcCSGCXik1olhn6h9oPJpHfvjbBZyqPuHU-ibuQXyHNclynoHBANDkf_WRtWc.; SUB=_2A250gpeCDeRhGedH4lMY9S3Nzz-IHXVX-Y5KrDV8PUNbmtBeLUWjkW9XZkSwKQAYTJkZTvK42M-ekE_Hnw..; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WWEbrh.i5bvbnSvvbQHi2M35JpX5KMhUgL.Fo241K24SKepShe2dJLoIpjLxKML1KBLBo5LxK-LB--LB-2LxK-L1h-L1h2t; SUHB=0LcjYQm8zbqhfQ; ALF=1533549388; _s_tentry=-; Apache=9574205823039.291.1502013327944; ULV=1502013327950:6:1:2:9574205823039.291.1502013327944:1501421673576"); con.header("Referer",following_url); //关注 org.jsoup.nodes.Document doc; try { doc = con.post(); for (Element e: doc.getElementsByClass("fwBox") ){ String following_url_redirect = e.attr("href"); System.out.println(following_url_redirect); /////////////////////////去找每一个关注的人的粉丝数 判断其是不是大V Connection con_direct = Jsoup.connect(following_url_redirect);//和url建立连接 con_direct.data("uid",uid); con_direct.userAgent("Mozilla"); con_direct.data("currentuid","1991953413"); con_direct.data("page_size","32"); //设了一个cookie 以新浪用户登录的 con_direct.header("Cookie","SINAGLOBAL=9711952802808.682.1499668084041; __gads=ID=130480cbf75a5ad6:T=1500258878:S=ALNI_MZRcFnJveJoxtD4TBbJNbD__KcD-A; UM_distinctid=15d4e658b271f5-0573ee5fbac68c-1b1d7751-1fa400-15d4e658b28944; wvr=6; UOR=,,www.shejidaren.com; _ga=GA1.2.84290483.1500258858; _gid=GA1.2.952496907.1501988163; crtg_rta=; SSOLoginState=1502013392; SCF=Agl4g4wcoIanRFGqnVTCrYVRKcGcCSGCXik1olhn6h9oPJpHfvjbBZyqPuHU-ibuQXyHNclynoHBANDkf_WRtWc.; SUB=_2A250gpeCDeRhGedH4lMY9S3Nzz-IHXVX-Y5KrDV8PUNbmtBeLUWjkW9XZkSwKQAYTJkZTvK42M-ekE_Hnw..; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WWEbrh.i5bvbnSvvbQHi2M35JpX5KMhUgL.Fo241K24SKepShe2dJLoIpjLxKML1KBLBo5LxK-LB--LB-2LxK-L1h-L1h2t; SUHB=0LcjYQm8zbqhfQ; ALF=1533549388; _s_tentry=-; Apache=9574205823039.291.1502013327944; ULV=1502013327950:6:1:2:9574205823039.291.1502013327944:1501421673576");// con_direct.header("Referer","http://tw.weibo.com/libingbing/follow/p/12");//从李冰冰的微博关注名单的12页开始发送请求 con_direct.header("Referer",following_url_redirect); //关注 org.jsoup.nodes.Document doc_direct; try{ doc_direct=con_direct.get(); String celebrant_following; //关注人数 String following_celebrant_uid;//大Vuid for (Element e1: doc_direct.select("#mInfo").select("ul").select("li.fansNum").select("a").select("strong")){ String follower_num = e1.text();//抓取粉丝数 if(Long.parseLong(follower_num)>1000000)//粉丝数如果大于100万 则为大V { //从url里截取uid following_celebrant_uid=following_url_redirect.substring(following_url_redirect.lastIndexOf("/")+1,following_url_redirect.length()); //爬取去该大V的关注数 Elements e2=doc_direct.select("#mInfo").select("ul").select("li.followNum").select("a").select("strong"); celebrant_following=e2.text(); //将新爬取的大V的uid和其关注数 连接成新的字符串 String new_uid_fonum=following_celebrant_uid+","+celebrant_following; //将新爬取的大V的uid和其关注数 入队列 celebrate_queue.enQueue(new_uid_fonum); System.out.println("[DEBUG]"+following_celebrant_uid+" 是大V且已经入队"); certain_celebrate_list.add(following_celebrant_uid); } } String uids = Joiner.on(",").join(certain_celebrate_list); BasicDBObject document = new BasicDBObject(); document.put("_id", uid); //该用户的uid document.put("following_celebrant", certain_celebrate_list); writeDataIntoMongo(document); //System.out.println("[DEBUG]"+following_celebrant_uid+"以入mongo"); timeDelay(1,3);//最小为5 最大为15 单线程 } catch (IOException e1) { // TODO Auto-generated catch block e1.printStackTrace(); } } }catch (IOException e1) { // TODO Auto-generated catch block //e1.printStackTrace(); timeDelay(30,35); continue; } } } public static void timeDelay(float min, float max){ int random = (int)(max * Math.random() + min); try { Thread.sleep(random * 1000);//线程睡觉 } catch (InterruptedException e) { // TODO Auto-generated catch block e.printStackTrace(); }}}
阅读全文
0 0
- 爬虫
- 爬虫
- 爬虫
- 爬虫
- 爬虫
- 爬虫
- 爬虫
- 爬虫
- 爬虫
- 爬虫
- 爬虫
- 爬虫
- 爬虫
- 爬虫
- 爬虫
- 爬虫
- 爬虫
- 爬虫
- java 图片上传的处理,获取图片大小,宽高。
- http缓存
- 通过添加路由的方式解决ping不通网络的问题
- Java爬虫-webmagic
- AK黄成佳|成为优秀的企业家,真的有你想的这么简单吗?
- 爬虫
- 流式布局
- JS添加事件和解绑事件:addEventListener()与removeEventListener()
- Java对象垃圾回收
- 【Spring+SpringMVC+MyBatis深入学习及搭建】08.MyBatis查询缓存
- Java中的char类型
- AI真的会杀人?DeepMind开发了二维网格游戏来做测试
- webview 加载https出现的问题
- Linux下安装Tomcat