爬虫

来源:互联网 发布:如何把json解析出来 编辑:程序博客网 时间:2024/05/18 22:45
package demo;import com.google.common.base.Joiner;import com.mongodb.BasicDBObject;import com.mongodb.DB;import com.mongodb.DBCollection;import com.mongodb.MongoClient;import org.jsoup.Connection;import org.jsoup.Jsoup;import org.jsoup.nodes.Document;import org.jsoup.nodes.Element;import org.jsoup.select.Elements;import redis.clients.jedis.Jedis;import java.io.IOException;import java.util.ArrayList;import java.util.List;import javax.swing.plaf.synth.SynthSpinnerUI;public class RedisSpider2 {    private static MongoClient mongo = new MongoClient( "localhost" , 27017 );    private static Jedis jedis=new Jedis("localhost");    /*    private static Queue celebrate_queue=new Queue();//大V队列    private static void InputSeedIntoQueue(){        celebrate_queue.enQueue("1782270602,286");        celebrate_queue.enQueue("yiwenmu,575");    }   */    public static String getSeedFromRedis() {        // 从Redis中获取数据并出队,uid以及connum,取出后处理分离,取出后即刻销毁        String element = jedis.lpop("follower:uid_connum");        // System.out.println("[DEBUG] : 当前使用了: " + element);        System.out.println("[DEBUG] : 现出队 -------------------- " + element);        return element;    }    public static void writeDataIntoMongo(BasicDBObject document){        MongoClient mongo;        mongo = new MongoClient( "localhost" , 27017 );        DB db = mongo.getDB("Belle");     //得到数据库        DBCollection table = db.getCollection("Following"); //拿到table        table.save(document);    }    public static void main(String[] args) throws IOException {         //InputSeedIntoQueue();        System.out.println("成功读取种子文件");        while(true){            String uid_fonum=celebrate_queue.deQueue();//读种子用户            System.out.println("[DEBUG] 大V "+uid_fonum+" 已出队列");            String[] parts = uid_fonum.split(",");//以逗号分隔开 存入parts字符串数组中            String uid = parts[0];  //分隔的第一个部分就是uid            int following = Integer.parseInt(parts[1]);  //把字符串转转换成整型            System.out.println("[DEBUG]----开始爬取-----"+uid_fonum+"-------");                     crawler_and_toMongo(uid,following);   //种子用户开始爬        }    }    private static void crawler_and_toMongo(String uid, int following) {        float total_number = (float) (following*1.0/32 +3);  //一页32个关注用户        ArrayList<String> certain_celebrate_list = new ArrayList();        for(int i=1;i<total_number;i++){            System.out.println("The current uid is "+uid+" and the page number is "+i);             String following_url = "http://tw.weibo.com/"+uid+"/follow/p/"+i;            Connection con = Jsoup.connect("http://tw.weibo.com/api/user/follow");//和url建立连接            con.data("uid",uid);                           con.data("page",String.valueOf(i));            con.data("currentuid","1991953413");              con.data("page_size","32");            //设了一个cookie 以新浪用户登录的            con.header("Cookie","SINAGLOBAL=9711952802808.682.1499668084041; __gads=ID=130480cbf75a5ad6:T=1500258878:S=ALNI_MZRcFnJveJoxtD4TBbJNbD__KcD-A; UM_distinctid=15d4e658b271f5-0573ee5fbac68c-1b1d7751-1fa400-15d4e658b28944; wvr=6; UOR=,,www.shejidaren.com; _ga=GA1.2.84290483.1500258858; _gid=GA1.2.952496907.1501988163; crtg_rta=; SSOLoginState=1502013392; SCF=Agl4g4wcoIanRFGqnVTCrYVRKcGcCSGCXik1olhn6h9oPJpHfvjbBZyqPuHU-ibuQXyHNclynoHBANDkf_WRtWc.; SUB=_2A250gpeCDeRhGedH4lMY9S3Nzz-IHXVX-Y5KrDV8PUNbmtBeLUWjkW9XZkSwKQAYTJkZTvK42M-ekE_Hnw..; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WWEbrh.i5bvbnSvvbQHi2M35JpX5KMhUgL.Fo241K24SKepShe2dJLoIpjLxKML1KBLBo5LxK-LB--LB-2LxK-L1h-L1h2t; SUHB=0LcjYQm8zbqhfQ; ALF=1533549388; _s_tentry=-; Apache=9574205823039.291.1502013327944; ULV=1502013327950:6:1:2:9574205823039.291.1502013327944:1501421673576");            con.header("Referer",following_url);  //关注            org.jsoup.nodes.Document doc;            try {                doc = con.post();                for (Element e: doc.getElementsByClass("fwBox") ){                    String following_url_redirect = e.attr("href");                    System.out.println(following_url_redirect);                    /////////////////////////去找每一个关注的人的粉丝数 判断其是不是大V                    Connection con_direct = Jsoup.connect(following_url_redirect);//和url建立连接                    con_direct.data("uid",uid);                                   con_direct.userAgent("Mozilla");                    con_direct.data("currentuid","1991953413");                      con_direct.data("page_size","32");                    //设了一个cookie 以新浪用户登录的                    con_direct.header("Cookie","SINAGLOBAL=9711952802808.682.1499668084041; __gads=ID=130480cbf75a5ad6:T=1500258878:S=ALNI_MZRcFnJveJoxtD4TBbJNbD__KcD-A; UM_distinctid=15d4e658b271f5-0573ee5fbac68c-1b1d7751-1fa400-15d4e658b28944; wvr=6; UOR=,,www.shejidaren.com; _ga=GA1.2.84290483.1500258858; _gid=GA1.2.952496907.1501988163; crtg_rta=; SSOLoginState=1502013392; SCF=Agl4g4wcoIanRFGqnVTCrYVRKcGcCSGCXik1olhn6h9oPJpHfvjbBZyqPuHU-ibuQXyHNclynoHBANDkf_WRtWc.; SUB=_2A250gpeCDeRhGedH4lMY9S3Nzz-IHXVX-Y5KrDV8PUNbmtBeLUWjkW9XZkSwKQAYTJkZTvK42M-ekE_Hnw..; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WWEbrh.i5bvbnSvvbQHi2M35JpX5KMhUgL.Fo241K24SKepShe2dJLoIpjLxKML1KBLBo5LxK-LB--LB-2LxK-L1h-L1h2t; SUHB=0LcjYQm8zbqhfQ; ALF=1533549388; _s_tentry=-; Apache=9574205823039.291.1502013327944; ULV=1502013327950:6:1:2:9574205823039.291.1502013327944:1501421673576");//                  con_direct.header("Referer","http://tw.weibo.com/libingbing/follow/p/12");//从李冰冰的微博关注名单的12页开始发送请求                    con_direct.header("Referer",following_url_redirect);  //关注                    org.jsoup.nodes.Document doc_direct;                try{                           doc_direct=con_direct.get();                   String   celebrant_following; //关注人数                   String   following_celebrant_uid;//大Vuid                    for (Element e1: doc_direct.select("#mInfo").select("ul").select("li.fansNum").select("a").select("strong")){                            String follower_num = e1.text();//抓取粉丝数                            if(Long.parseLong(follower_num)>1000000)//粉丝数如果大于100万  则为大V                            {                                //从url里截取uid                                following_celebrant_uid=following_url_redirect.substring(following_url_redirect.lastIndexOf("/")+1,following_url_redirect.length());                                //爬取去该大V的关注数                               Elements e2=doc_direct.select("#mInfo").select("ul").select("li.followNum").select("a").select("strong");                               celebrant_following=e2.text();                                //将新爬取的大V的uid和其关注数 连接成新的字符串                                String new_uid_fonum=following_celebrant_uid+","+celebrant_following;                                //将新爬取的大V的uid和其关注数 入队列                                celebrate_queue.enQueue(new_uid_fonum);                             System.out.println("[DEBUG]"+following_celebrant_uid+"   是大V且已经入队");                             certain_celebrate_list.add(following_celebrant_uid);                            }                        }                     String uids = Joiner.on(",").join(certain_celebrate_list);                        BasicDBObject document = new BasicDBObject();                        document.put("_id", uid); //该用户的uid                        document.put("following_celebrant", certain_celebrate_list);                        writeDataIntoMongo(document);                        //System.out.println("[DEBUG]"+following_celebrant_uid+"以入mongo");                    timeDelay(1,3);//最小为5 最大为15    单线程        } catch (IOException e1) {                // TODO Auto-generated catch block                e1.printStackTrace();            }                    }            }catch (IOException e1) {                // TODO Auto-generated catch block                //e1.printStackTrace();                timeDelay(30,35);                continue;            }        }    }    public static void timeDelay(float min, float max){        int random = (int)(max * Math.random() + min);        try {            Thread.sleep(random * 1000);//线程睡觉        } catch (InterruptedException e) {            // TODO Auto-generated catch block            e.printStackTrace();        }}}
原创粉丝点击