抓取Foursquare网页信息的爬虫

来源:互联网 发布:tensorflow可视化 编辑:程序博客网 时间:2024/04/30 12:33

总结下最近完成的一个爬虫,具体就不说了,代码贴出来,需要的同学可以拿去玩玩。

Foursquare最大的问题是动态网页,也就是说网址不变,但页面内容在变。这样的话,用Jsoup就无能为力了。

因此我使用了Selenium去解决动态网页的问题,代码调试的时候大家记得需要导入Selenium的jar包。

单个地点抓取类:

package Test1;import java.net.UnknownHostException;import com.mongodb.BasicDBObject;import com.thoughtworks.selenium.*;//This is the driver's import. You'll use this for instantiating a//browser and making it do what you need.import org.jsoup.Jsoup;import org.jsoup.helper.Validate;import org.jsoup.nodes.Document;import org.jsoup.nodes.Element;import org.jsoup.select.Elements;import com.mongodb.BasicDBObject;import com.mongodb.DB;import com.mongodb.DBCollection;import com.mongodb.DBCursor;import com.mongodb.DBObject;import com.mongodb.Mongo;import com.mongodb.MongoException;import com.mongodb.util.JSON;import java.util.regex.Pattern;@SuppressWarnings("deprecation")public class GetELocation extends SeleneseTestCase {/*get the information for every location and save it into mongodb*/public String url;public Document doc;public DBCollection collection;public BasicDBObject document = new BasicDBObject();public void LinkMongodb() throws Exception {Mongo mongo = new Mongo("localhost", 27017);    DB db = mongo.getDB("FourS2");    collection = db.getCollection("Foursquare");    System.out.println("Link Mongodb!");}public void setUp() throws Exception {     setUp(url, "*firefox");//opne the international web.System.out.println("Open firefox!");   }public void Openurl() throws Exception {selenium.open(url);Thread.sleep(30000);String str = selenium.getHtmlSource();//get the source of html for the web.doc = Jsoup.parse(str);//chang this str into jsoup can read the document.System.out.println("Open the url and Save doc!");}public void Savetitle() throws Exception {String title = new String(doc.title());         document.put("Title", title);        //get the title of this pages.        
System.out.println("Save Title of Page!");}public void SaveScore() throws Exception {Elements Class = doc.select(".rating>span");        String score= new String(Class.text());        document.put("Score", score);        //get the score of one place;        System.out.println("Save Score of location!");}public void SaveSimilar() throws Exception {//save the similar place for one special place;BasicDBObject temp = new BasicDBObject();Elements Similar = doc.select("#similarVenues>a");                int count = 0;        for (Element Place : Similar){                 String str = new String(Place.text());         count++;         String tempstr = String.valueOf(count);         temp.put(tempstr, str);        }        //String SimilarPlace = new String(Similar.text());        document.put("Similar", temp);        System.out.println("Save similar of palces!");}public void SaveEvents() throws Exception {//save the events about this place.Elements Event = doc.select("#exploreNearby>a");        int count = 0;        BasicDBObject temp = new BasicDBObject();        for (Element every : Event){                String str = new String(every.text());        count++;        String tempint = String.valueOf(count);        temp.put(tempint, str);        }        document.put("Events", temp);        System.out.println("Save the events of this place!");}public void SaveTips() throws Exception {//save the tips about this place.BasicDBObject temp = new BasicDBObject();int n = (Integer)selenium.getXpathCount("//span[@class='page-node']");//count = count-1;Elements tips = doc.select(".tipText");        int count = 0;                for (Element link : tips){                String str = new String(link.text());        count++;        String tempint = String.valueOf(count);        temp.put(tempint, str);        //print("%s \r\n", link.text());        }        //save the fires page tips.        
for(int i=1;i<n;i++){selenium.click("xpath=//span[@page-num='"+i+"']");Thread.sleep(30000);String str2 = selenium.getHtmlSource();Document tempdoc = Jsoup.parse(str2);Elements temptips = tempdoc.select(".tipText");//count = 0;for(Element link : temptips){String str = new String(link.text());count++;String tempint = String.valueOf(count);temp.put(tempint, str);}}document.put("Tips", temp);System.out.println("Save tips of this places!");}public void SavePhotos() throws Exception{BasicDBObject temp = new BasicDBObject();int count;count = 0;selenium.click("xpath=//a[@class='photosLink']");        Thread.sleep(3000);               for (int second = 0;; second++) {            if(second >=180){                break;            }            selenium.getEval("window.scrollBy(0,400)");             Thread.sleep(3000);        }                try{               int n = (Integer)selenium.getXpathCount("//div[@class='photo']");        System.out.println(n);        for (int i=1;i<n;i++){            String str2 = selenium.getAttribute("xpath=(//div[@class='photo'])["+i+"]/span/img@src");            //print("%s", str2);            //System.out.println(second);            count++;            String tempint = String.valueOf(count);temp.put(tempint, str2);            }        }catch(Exception e){        e.printStackTrace();                System.out.println(url);        }                document.put("Photos", temp);        System.out.println("Save the photo of this places!");}public void SaveCategories() throws Exception {String str = selenium.getText("xpath=//p[@class='categories']");document.put("Categories", str);System.out.println("Save the categories!");}public void Insertdata() throws Exception {collection.insert(document);System.out.println("Insert the data into Mongodb!");} public void CloseBrower() {        selenium.stop();}}

单个地点Similar Place地点抓取:

package Test1;import java.net.UnknownHostException;import com.mongodb.BasicDBObject;import com.thoughtworks.selenium.*;//This is the driver's import. You'll use this for instantiating a//browser and making it do what you need.import org.jsoup.Jsoup;import org.jsoup.helper.Validate;import org.jsoup.nodes.Document;import org.jsoup.nodes.Element;import org.jsoup.select.Elements;import java.util.LinkedList;import java.util.Queue;import java.util.regex.Pattern;//Selenium-IDE add the Pattern module because it's sometimes used for//regex validations. You can remove the module if it's not used in your//script.@SuppressWarnings("deprecation")public class GetSimilarPlace extends SeleneseTestCase {   public String url;      public void setUp() throws Exception {        setUp(url, "*firefox");   }      public Queue<String> PushQueue(Queue<String> queue) throws Exception {       selenium.open(url);    selenium.windowMaximize();        try{        //url = selenium.getLocation();        //Queue<String> queue = new LinkedList<String>();         int n = (Integer)selenium.getXpathCount("//div[@id='similarVenues']//a");        System.out.println(n);        for (int i=1;i<n;i++){            String str2 = selenium.getAttribute("xpath=(//div[@id='similarVenues']//a)["+i+"]@href");            String temp = new String("https://foursquare.com"+str2);            queue.offer(temp);            //print("%s", temp);            //System.out.println(second);            }                //print("\r\n");                //String str;        //while((str=queue.poll())!=null){              //    print("%s", str);             //}              //System.out.println();              //System.out.println(queue.size());                      }catch(Exception e){                e.printStackTrace();                }        return queue;   }       public static void print(String msg, Object... 
args) {           System.out.println(String.format(msg, args));}        public void CloseBrower() {        selenium.stop();    }}

总体函数:

package Test1;import com.thoughtworks.selenium.*; //import com.thoughtworks.selenium.*;//This is the driver's import. You'll use this for instantiating a//browser and making it do what you need.import java.math.BigInteger;import java.security.MessageDigest;import java.util.regex.Pattern;import java.util.Iterator;import java.util.List;import java.util.ArrayList;import java.util.LinkedList;import java.util.Queue;import java.util.regex.Pattern;import java.util.Iterator;import java.util.Set;import java.util.TreeSet;import java.io.FileInputStream;import java.security.MessageDigest;import java.math.*;import java.security.*;import org.jsoup.Jsoup;public class SaveOPage extends SeleneseTestCase {public static String seed = new String("https://foursquare.com/v/singapore-zoo/4b05880ef964a520b8ae22e3");public static Queue<String> queue = new LinkedList<String>();public static Set<String> s = new TreeSet<String>();public void GetOnePage(String url) {GetELocation st = new GetELocation();//st.print("124");//SaveOPage st = new SaveOPage();     try {         st.url = new String(url);st.setUp();st.LinkMongodb();st.Openurl();st.Savetitle();st.SaveSimilar();st.SaveScore();st.SaveEvents();st.SaveSimilar();st.SaveCategories();st.SaveTips();st.SavePhotos();st.Insertdata();st.CloseBrower();} catch (Exception e) {// TODO Auto-generated catch blocke.printStackTrace();}    }public void GetSimilar(String url) {GetSimilarPlace gs = new GetSimilarPlace();try {gs.url = url;gs.setUp();queue = gs.PushQueue(queue);gs.CloseBrower();} catch (Exception e) {e.printStackTrace();} }public static void main(String[] args) {       SaveOPage sp = new SaveOPage();queue.offer(seed);//sp.GetOnePage(seed);//sp.GetSimilar(seed);System.out.println(1);String url;int count = 0;while((url=queue.poll())!=null){          count++;if (count>3){break;}Set<String> temp = new TreeSet<String>();temp = s;String md5 = new String();try{MessageDigest mdEnc = MessageDigest.getInstance("MD5");     mdEnc.update(url.getBytes(), 0, 
url.length());    md5 = new BigInteger(1, mdEnc.digest()).toString(16);    System.out.println(md5);    //for md5 code.} catch (Exception e) {e.printStackTrace();}    s.add(md5);System.out.println(url);if (s.equals(temp)){System.out.println("Start");sp.GetSimilar(url);sp.GetOnePage(url);}    }  }}


抓取的数据我存在了MongoDB中,程序同样需要导入MongoDB的Java驱动包,运行时注意。代码不多,希望对大家有帮助。
原创粉丝点击