抓取Foursquare网页信息的爬虫
来源:互联网 发布:tensorflow可视化 编辑:程序博客网 时间:2024/04/30 12:33
总结下最近完成的一个爬虫,具体就不说了,代码贴出来,需要的同学可以拿去玩玩。
Foursquare最大的问题是动态网页,就是说网址不变,但内容在变。这样的话,用Jsoup就无能为力了
因此我使用了Selenium去解决动态网页的问题,代码调试的时候大家记得导入Selenium的相关jar包。
单个地点抓取类:
package Test1;import java.net.UnknownHostException;import com.mongodb.BasicDBObject;import com.thoughtworks.selenium.*;//This is the driver's import. You'll use this for instantiating a//browser and making it do what you need.import org.jsoup.Jsoup;import org.jsoup.helper.Validate;import org.jsoup.nodes.Document;import org.jsoup.nodes.Element;import org.jsoup.select.Elements;import com.mongodb.BasicDBObject;import com.mongodb.DB;import com.mongodb.DBCollection;import com.mongodb.DBCursor;import com.mongodb.DBObject;import com.mongodb.Mongo;import com.mongodb.MongoException;import com.mongodb.util.JSON;import java.util.regex.Pattern;@SuppressWarnings("deprecation")public class GetELocation extends SeleneseTestCase {/*get the information for every location and save it into mongodb*/public String url;public Document doc;public DBCollection collection;public BasicDBObject document = new BasicDBObject();public void LinkMongodb() throws Exception {Mongo mongo = new Mongo("localhost", 27017); DB db = mongo.getDB("FourS2"); collection = db.getCollection("Foursquare"); System.out.println("Link Mongodb!");}public void setUp() throws Exception { setUp(url, "*firefox");//opne the international web.System.out.println("Open firefox!"); }public void Openurl() throws Exception {selenium.open(url);Thread.sleep(30000);String str = selenium.getHtmlSource();//get the source of html for the web.doc = Jsoup.parse(str);//chang this str into jsoup can read the document.System.out.println("Open the url and Save doc!");}public void Savetitle() throws Exception {String title = new String(doc.title()); document.put("Title", title); //get the title of this pages. 
System.out.println("Save Title of Page!");}public void SaveScore() throws Exception {Elements Class = doc.select(".rating>span"); String score= new String(Class.text()); document.put("Score", score); //get the score of one place; System.out.println("Save Score of location!");}public void SaveSimilar() throws Exception {//save the similar place for one special place;BasicDBObject temp = new BasicDBObject();Elements Similar = doc.select("#similarVenues>a"); int count = 0; for (Element Place : Similar){ String str = new String(Place.text()); count++; String tempstr = String.valueOf(count); temp.put(tempstr, str); } //String SimilarPlace = new String(Similar.text()); document.put("Similar", temp); System.out.println("Save similar of palces!");}public void SaveEvents() throws Exception {//save the events about this place.Elements Event = doc.select("#exploreNearby>a"); int count = 0; BasicDBObject temp = new BasicDBObject(); for (Element every : Event){ String str = new String(every.text()); count++; String tempint = String.valueOf(count); temp.put(tempint, str); } document.put("Events", temp); System.out.println("Save the events of this place!");}public void SaveTips() throws Exception {//save the tips about this place.BasicDBObject temp = new BasicDBObject();int n = (Integer)selenium.getXpathCount("//span[@class='page-node']");//count = count-1;Elements tips = doc.select(".tipText"); int count = 0; for (Element link : tips){ String str = new String(link.text()); count++; String tempint = String.valueOf(count); temp.put(tempint, str); //print("%s \r\n", link.text()); } //save the fires page tips. 
for(int i=1;i<n;i++){selenium.click("xpath=//span[@page-num='"+i+"']");Thread.sleep(30000);String str2 = selenium.getHtmlSource();Document tempdoc = Jsoup.parse(str2);Elements temptips = tempdoc.select(".tipText");//count = 0;for(Element link : temptips){String str = new String(link.text());count++;String tempint = String.valueOf(count);temp.put(tempint, str);}}document.put("Tips", temp);System.out.println("Save tips of this places!");}public void SavePhotos() throws Exception{BasicDBObject temp = new BasicDBObject();int count;count = 0;selenium.click("xpath=//a[@class='photosLink']"); Thread.sleep(3000); for (int second = 0;; second++) { if(second >=180){ break; } selenium.getEval("window.scrollBy(0,400)"); Thread.sleep(3000); } try{ int n = (Integer)selenium.getXpathCount("//div[@class='photo']"); System.out.println(n); for (int i=1;i<n;i++){ String str2 = selenium.getAttribute("xpath=(//div[@class='photo'])["+i+"]/span/img@src"); //print("%s", str2); //System.out.println(second); count++; String tempint = String.valueOf(count);temp.put(tempint, str2); } }catch(Exception e){ e.printStackTrace(); System.out.println(url); } document.put("Photos", temp); System.out.println("Save the photo of this places!");}public void SaveCategories() throws Exception {String str = selenium.getText("xpath=//p[@class='categories']");document.put("Categories", str);System.out.println("Save the categories!");}public void Insertdata() throws Exception {collection.insert(document);System.out.println("Insert the data into Mongodb!");} public void CloseBrower() { selenium.stop();}}
单个地点Similar Place地点抓取:
package Test1;import java.net.UnknownHostException;import com.mongodb.BasicDBObject;import com.thoughtworks.selenium.*;//This is the driver's import. You'll use this for instantiating a//browser and making it do what you need.import org.jsoup.Jsoup;import org.jsoup.helper.Validate;import org.jsoup.nodes.Document;import org.jsoup.nodes.Element;import org.jsoup.select.Elements;import java.util.LinkedList;import java.util.Queue;import java.util.regex.Pattern;//Selenium-IDE add the Pattern module because it's sometimes used for//regex validations. You can remove the module if it's not used in your//script.@SuppressWarnings("deprecation")public class GetSimilarPlace extends SeleneseTestCase { public String url; public void setUp() throws Exception { setUp(url, "*firefox"); } public Queue<String> PushQueue(Queue<String> queue) throws Exception { selenium.open(url); selenium.windowMaximize(); try{ //url = selenium.getLocation(); //Queue<String> queue = new LinkedList<String>(); int n = (Integer)selenium.getXpathCount("//div[@id='similarVenues']//a"); System.out.println(n); for (int i=1;i<n;i++){ String str2 = selenium.getAttribute("xpath=(//div[@id='similarVenues']//a)["+i+"]@href"); String temp = new String("https://foursquare.com"+str2); queue.offer(temp); //print("%s", temp); //System.out.println(second); } //print("\r\n"); //String str; //while((str=queue.poll())!=null){ // print("%s", str); //} //System.out.println(); //System.out.println(queue.size()); }catch(Exception e){ e.printStackTrace(); } return queue; } public static void print(String msg, Object... args) { System.out.println(String.format(msg, args));} public void CloseBrower() { selenium.stop(); }}
总体函数:
package Test1;import com.thoughtworks.selenium.*; //import com.thoughtworks.selenium.*;//This is the driver's import. You'll use this for instantiating a//browser and making it do what you need.import java.math.BigInteger;import java.security.MessageDigest;import java.util.regex.Pattern;import java.util.Iterator;import java.util.List;import java.util.ArrayList;import java.util.LinkedList;import java.util.Queue;import java.util.regex.Pattern;import java.util.Iterator;import java.util.Set;import java.util.TreeSet;import java.io.FileInputStream;import java.security.MessageDigest;import java.math.*;import java.security.*;import org.jsoup.Jsoup;public class SaveOPage extends SeleneseTestCase {public static String seed = new String("https://foursquare.com/v/singapore-zoo/4b05880ef964a520b8ae22e3");public static Queue<String> queue = new LinkedList<String>();public static Set<String> s = new TreeSet<String>();public void GetOnePage(String url) {GetELocation st = new GetELocation();//st.print("124");//SaveOPage st = new SaveOPage(); try { st.url = new String(url);st.setUp();st.LinkMongodb();st.Openurl();st.Savetitle();st.SaveSimilar();st.SaveScore();st.SaveEvents();st.SaveSimilar();st.SaveCategories();st.SaveTips();st.SavePhotos();st.Insertdata();st.CloseBrower();} catch (Exception e) {// TODO Auto-generated catch blocke.printStackTrace();} }public void GetSimilar(String url) {GetSimilarPlace gs = new GetSimilarPlace();try {gs.url = url;gs.setUp();queue = gs.PushQueue(queue);gs.CloseBrower();} catch (Exception e) {e.printStackTrace();} }public static void main(String[] args) { SaveOPage sp = new SaveOPage();queue.offer(seed);//sp.GetOnePage(seed);//sp.GetSimilar(seed);System.out.println(1);String url;int count = 0;while((url=queue.poll())!=null){ count++;if (count>3){break;}Set<String> temp = new TreeSet<String>();temp = s;String md5 = new String();try{MessageDigest mdEnc = MessageDigest.getInstance("MD5"); mdEnc.update(url.getBytes(), 0, url.length()); md5 = new 
BigInteger(1, mdEnc.digest()).toString(16); System.out.println(md5); //for md5 code.} catch (Exception e) {e.printStackTrace();} s.add(md5);System.out.println(url);if (s.equals(temp)){System.out.println("Start");sp.GetSimilar(url);sp.GetOnePage(url);} } }}
- 抓取Foursquare网页信息的爬虫
- python3爬虫--抓取网页信息
- Python爬虫学习,抓取网页上的天气信息
- 抓取教程网页的小爬虫
- 爬虫的自我解剖(抓取网页HtmlUnit)
- 爬虫的自我解剖(抓取网页HtmlUnit)
- 爬虫的自我解剖(抓取网页HtmlUnit)
- Python爬虫之抓取豆瓣信息 全部网页显示
- python——爬虫实现网页信息抓取
- C# 使用 Abot 实现 爬虫 抓取网页信息 源码下载
- 抓取防爬虫的网站信息
- Java爬虫,信息抓取的实现
- Java爬虫,信息抓取的实现
- Java爬虫,信息抓取的实现
- Java爬虫,信息抓取的实现
- Java爬虫,信息抓取的实现
- Java爬虫,信息抓取的实现
- Java爬虫,信息抓取的实现
- socket学习之电脑手机通信
- Migrating Queries from One User To Another从一个用户迁移到另一个查询
- 一个简单的计算从1到1000000000所花时间的小程序
- Jquery常用技巧和方法收集
- 子网掩码 子网划分
- 抓取Foursquare网页信息的爬虫
- 京东陷入融资与扩张怪圈
- 求四个数的最大公约数
- VirtualBox 中共享文件夹的设置
- 堆和栈的区别
- 连连看游戏。。c++编写。。
- treeview 控件点击父节点所有子节点全部展开代码,改变部分节点颜色
- 模版工厂模式
- Android系统文件夹结构解析(一)--/system/app