Spider for UCI Machine Learning Repository
来源:互联网 发布:大华网络监控 编辑:程序博客网 时间:2024/05/01 12:31
原谅我用英文标题,因为我觉得用英文来表示更贴切一些。这个爬虫用于搜集UCI机器学习知识库中各数据集的背景资料和下载链接,主要运用jsoup包解析网页,然后用jxl包把资料写入Excel。
每个数据集都有下列信息:
Name,AssociatedTasks,AttributeCharacteristics,NumberOfInstances,
NumberOfAttributes,DataFull,Year,HitTimes,DataSetInformation,
AttributeInformation, DownloadLink
如果有信息缺失,则用-1或者N/A代替
保存结果的Excel被放置在桌面上,名字为 “aaa.xls”
现在版本的缺陷就是数据集的名字显示有点问题,空格他自动用+号代替,左括号用%28代替,右括号用%29代替。暂时还没有改过来。
下面是代码
import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import java.io.* ;import jxl.Workbook; import jxl.read.biff.BiffException;import jxl.write.Label ;import jxl.write.WritableSheet;import jxl.write.WritableWorkbook;import jxl.write.WriteException;import jxl.write.biff.RowsExceededException;public class UCIData { //类的成员 public int Num ; public String Name ; public String AssociatedTasks ; public String AttributeCharacteristics ; public int NumberofInstances ; public int NumberofAttributes ; public boolean DataFull ; public int Year ; public int HitTimes ; public String DataSetInformation ; public String AttributeInformation ; public String DownloadLink ; public static void main(String[] args) throws IOException, RowsExceededException, WriteException { String filename="C:\\Users\\multiangle\\Desktop\\aaa.xls" ; try { ExcelTitlePrint(filename) ; } catch (RowsExceededException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (WriteException e) { // TODO Auto-generated catch block e.printStackTrace(); } String url="http://archive.ics.uci.edu/ml/datasets.html" ; Document doc=getPage(url) ; UCIData[] dataset ; dataset=FindData(doc) ; } public UCIData(){ //Construction Method Num=-1 ; Name=null ; AssociatedTasks=null ; AttributeCharacteristics=null ; NumberofInstances=-1 ; NumberofAttributes=-1 ; DataFull=false ; Year=-1 ; HitTimes=-1 ; DataSetInformation=null ; AttributeInformation=null ; DownloadLink=null ; } public static UCIData[] FindData(Document doc) throws IOException, RowsExceededException, WriteException{ /** * 用来搜集各个数据集的信息 */ Element table=doc.getElementsByTag("table").get(1); Element td=table.getElementsByTag("td").first().nextElementSibling(); Element table2=td.getElementsByTag("table").first().nextElementSibling(); int listnum=table2.getElementsByTag("tr").first().siblingElements().size() ; UCIData[] dataset=new UCIData[listnum] ; //listnum=10 ; //&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&7此处运行时删掉 for(int 
i=0;i<listnum;i++){ Element setlist=table2.getElementsByTag("tr").first().siblingElements().get(i) ; //具体到条 Element td2=setlist.getElementsByTag("a").first() ; String name=td2.attr("href").substring(9) ; //还可以提升,把+号和%去掉 System.out.println("正在搜集第"+i+"个数据集: "+name); String link=td2.absUrl("href") ; Document subpage=getPage(link) ; dataset[i]=InfoCutPage(subpage); //补充datase[i]的各项值 dataset[i].Num=i+1 ; dataset[i].Name=name ; print (dataset[i]) ; ExcelDataPrint(dataset[i],"C:\\Users\\multiangle\\Desktop\\aaa.xls") ; } return dataset ; } public static UCIData InfoCutPage(Document doc) throws IOException{ /** * 用来搜集单个数据集的信息 */ UCIData data=new UCIData() ; Element table=doc.getElementsByTag("table").first().siblingElements().get(1) ; Element td=table.getElementsByTag("td").first(); Element table2=td.getElementsByTag("table").first().nextElementSibling() ; //Simple Information Element tr=table2.getElementsByTag("tr").first() ; //the 1st line of simple information Element td2=tr.getElementsByTag("td").first().siblingElements().get(2) ; if(!td2.text().equals("N/A")) data.NumberofInstances= Integer.parseInt(td2.text()) ; tr=tr.nextElementSibling() ; //the 2ed line of simple information td2=tr.getElementsByTag("td").first().nextElementSibling(); //AttributeCharacteristics data.AttributeCharacteristics=td2.text(); td2=td2.nextElementSibling().nextElementSibling() ; //NumberofAttributes if(!td2.text().equals("N/A")) data.NumberofAttributes=Integer.parseInt(td2.text()) ; td2=td2.nextElementSibling().nextElementSibling() ; //Date if (!td2.text().equals("N/A")) data.Year=Integer.parseInt(td2.text().substring(0,4)) ; tr=tr.nextElementSibling() ; //the 3rd line of simple information td2=tr.getElementsByTag("td").first().nextElementSibling(); //Associated Tasks data.AssociatedTasks=td2.text() ; td2=td2.nextElementSibling().nextElementSibling() ; //DataFull if (td2.text().equals("No")) data.DataFull=true ; else data.DataFull=false ; td2=td2.nextElementSibling().nextElementSibling() ; 
//HitTimes data.HitTimes=Integer.parseInt(td2.text()) ; Element p=table2.siblingElements().get(6) ; data.DataSetInformation=p.text() ; //DataSetInformation p=p.nextElementSibling().nextElementSibling().nextElementSibling() ; data.AttributeInformation=p.text() ; //AttributeInformation String downlink_pre=cutPage1(doc) ; data.DownloadLink=downlink_pre ; return data ; } public static String cutPage1(Document doc) throws IOException{ Element font=doc.getElementsByTag("font").get(5).parent() ; String link=font.absUrl("href") ; return link ; } public static String cutPage2(Document doc) throws IOException{ Element tr=doc.getElementsByTag("tr").get(3) ; Element link=tr.getElementsByTag("a").first() ; String href=link.absUrl("href") ; //System.out.println(href); return href ; } public static Document getPage_inner(String url) throws IOException{ Document doc ; try{ doc=Jsoup.connect(url).get() ; return doc ; }catch(IOException e){ return null; } } public static Document getPage(String url) throws IOException{ /** * 用来获取页面代码,与上面的getPage_inner一道,有多次重连功能 */ try{ Document doc ; int times=0 ; //System.out.println("正在请求获取网页"+url); doc=getPage_inner(url) ; while(doc.equals(null)&×<8){ doc=getPage_inner(url) ; System.out.println("第"+times+"次请求失败,正在进行第"+(++times)+"次请求"); } if (doc.equals(null)){ System.out.println("ERROR:获取网页失败"); return null ; }else{ return doc ; } }catch(IOException e){ return null ; } } public static void print(UCIData data){ System.out.println("Num: "+data.Num) ; System.out.println("Name: "+data.Name) ; System.out.println("AssociatedTasks: "+data.AssociatedTasks) ; System.out.println("AttributeCharacteristics: "+data.AttributeCharacteristics) ; System.out.println("NumberofInstances: "+data.NumberofInstances) ; System.out.println("NumberofAttributes: "+data.NumberofAttributes) ; System.out.println("DataFull: "+data.DataFull) ; System.out.println("Year: "+data.Year) ; System.out.println("HitTimes: "+data.HitTimes) ; System.out.println("DataSetInformation: 
"+data.DataSetInformation) ; System.out.println("AttributeInformation: "+data.AttributeInformation) ; System.out.println("DownloadLink: "+data.DownloadLink) ; } public static void ExcelDataPrint(UCIData data,String filename) throws RowsExceededException, WriteException { try { File file=new File(filename) ; Workbook wb = Workbook.getWorkbook(file); WritableWorkbook book = Workbook.createWorkbook(file,wb); WritableSheet sheet = book.getSheet(0) ; int rownum=data.Num+1; int colnum=0 ; jxl.write.Number num=new jxl.write.Number(colnum++,rownum,data.Num) ; Label name=new Label(colnum++,rownum,data.Name); Label associatetasks=new Label(colnum++,rownum,data.AssociatedTasks); Label attritebutecharacters=new Label(colnum++,rownum,data.AttributeCharacteristics); jxl.write.Number instance=new jxl.write.Number(colnum++,rownum,data.NumberofInstances) ; jxl.write.Number attributes=new jxl.write.Number(colnum++,rownum,data.NumberofAttributes) ; String datafull ; if (data.DataFull==true) datafull="true" ; else datafull="false" ; Label dataful=new Label(colnum++,rownum,datafull) ; jxl.write.Number year=new jxl.write.Number(colnum++,rownum,data.Year) ; jxl.write.Number hittimes=new jxl.write.Number(colnum++,rownum,data.HitTimes) ; Label datasetinfo=new Label(colnum++,rownum,data.DataSetInformation); Label attributeinfo=new Label(colnum++,rownum,data.AttributeInformation); Label downlink=new Label(colnum++,rownum,data.DownloadLink); sheet.addCell(num); sheet.addCell(name); sheet.addCell(associatetasks); sheet.addCell(attritebutecharacters); sheet.addCell(instance); sheet.addCell(attributes); sheet.addCell(dataful); sheet.addCell(year); sheet.addCell(hittimes); sheet.addCell(datasetinfo); sheet.addCell(attributeinfo); sheet.addCell(downlink); book.write(); book.close(); } catch (BiffException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } public static void ExcelTitlePrint(String 
filename) throws RowsExceededException, WriteException{ try { File file=new File(filename) ; WritableWorkbook book=Workbook.createWorkbook(file); WritableSheet sheet=book.createSheet("FirstPage",0) ; String[] colname={"Num" ,"Name","AssociatedTasks", "AttributeCharacteristics","NumberOfInstances","NumberOfAttributes", "DataFull","Year","HitTimes", "DataSetInformation","AttributeInformation","DownloadLink"}; for(int i=0;i<colname.length;i++){ Label label=new Label(i,0,colname[i]) ; sheet.addCell(label); } book.write(); book.close(); } catch (IOException e) { // TODO Auto-generated catch block System.out.println("ERROR:创建Excel文件失败"); } //打开文件 }}
0 0
- Spider for UCI Machine Learning Repository
- UCI Machine learning
- Learning Path for Machine Learning
- Software for machine learning
- Mapreduce for Machine Learning
- Mathematics for machine learning
- python for machine-learning
- Resource for Machine Learning
- advice for machine learning
- Open Tools for Machine Learning
- Machine Learning Library for Python
- 特征工程 for machine learning
- Code for Machine Learning Diagnostic
- Machine Learning Pipelines for R
- dual learning for machine translation
- Neural Networks for Machine Learning
- Stanford Machine Learning: (4). Advice for applying Machine Learning
- Machine Learning week 6 quiz: Advice for Applying Machine Learning
- 获取指定窗口信息(坐标,窗口风格)
- PHP开发经验
- PowerDesigner使用方法
- Java之BigInteger(面试题12:打印1到最大的n位数)
- hive学习笔记
- Spider for UCI Machine Learning Repository
- 如何使用crystal report显示多张报表
- 如何定制你自己的jQuery
- 兔子-myeclipse中的中文是横着的
- 【cocos2d-x-3.2】【高仿微信打飞机系列二】【敌机 碰撞检测 爆炸管理】
- js匿名函数作为函数参数
- 汉字英文混合字符串 截取指定字符串长度
- 关于error LNK2019: unresolved external symbol错误
- C++stl vector 用法