Spider for UCI Machine Learning Repository

来源：互联网　发布：大华网络监控　编辑：程序博客网　时间：2024/05/01 12:31
原谅我用英文标题……因为我觉得用英文来表示更贴切一些。这个爬虫用于搜集 UCI 机器学习知识库（UCI Machine Learning Repository）中各数据集的背景资料和下载链接，主要运用 jsoup 包解析网页，再用 jxl 包把资料写入 Excel 文件。
每个数据集都有下列信息：
Name,AssociatedTasks,AttributeCharacteristics,NumberOfInstances,
NumberOfAttributes,DataFull,Year,HitTimes,DataSetInformation,
AttributeInformation, DownloadLink
如果有信息缺失，则用-1或者N/A代替
保存结果的Excel被放置在桌面上，名字为 “aaa.xls”
当前版本的缺陷是数据集名称的显示有问题：名称直接取自链接地址，仍是 URL 编码形式，空格显示为 + 号，左括号显示为 %28，右括号显示为 %29，暂时还没有改过来。
下面是代码
import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import java.io.* ;import jxl.Workbook; import jxl.read.biff.BiffException;import jxl.write.Label ;import jxl.write.WritableSheet;import jxl.write.WritableWorkbook;import jxl.write.WriteException;import jxl.write.biff.RowsExceededException;public class UCIData {    //类的成员    public int Num ;     public String Name ;    public String AssociatedTasks ;    public String AttributeCharacteristics ;    public int NumberofInstances ;    public int NumberofAttributes ;    public boolean DataFull ;    public int Year ;    public int HitTimes ;    public String DataSetInformation ;    public String AttributeInformation ;    public String DownloadLink ;    public static void main(String[] args) throws IOException, RowsExceededException, WriteException {        String filename="C:\\Users\\multiangle\\Desktop\\aaa.xls" ;        try {            ExcelTitlePrint(filename) ;        } catch (RowsExceededException e) {            // TODO Auto-generated catch block            e.printStackTrace();        } catch (WriteException e) {            // TODO Auto-generated catch block            e.printStackTrace();        }        String url="http://archive.ics.uci.edu/ml/datasets.html" ;        Document doc=getPage(url) ;        UCIData[] dataset ;        dataset=FindData(doc) ;    }    public UCIData(){       //Construction Method        Num=-1 ;        Name=null ;        AssociatedTasks=null ;        AttributeCharacteristics=null ;        NumberofInstances=-1 ;        NumberofAttributes=-1 ;        DataFull=false ;        Year=-1 ;        HitTimes=-1 ;        DataSetInformation=null ;        AttributeInformation=null ;        DownloadLink=null ;    }    public static UCIData[] FindData(Document doc) throws IOException, RowsExceededException, WriteException{        /**        *   用来搜集各个数据集的信息        */        Element table=doc.getElementsByTag("table").get(1);        Element 
td=table.getElementsByTag("td").first().nextElementSibling();        Element table2=td.getElementsByTag("table").first().nextElementSibling();        int listnum=table2.getElementsByTag("tr").first().siblingElements().size() ;        UCIData[] dataset=new UCIData[listnum] ;        //listnum=10 ; //&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&7此处运行时删掉        for(int i=0;i<listnum;i++){            Element setlist=table2.getElementsByTag("tr").first().siblingElements().get(i) ;  //具体到条            Element td2=setlist.getElementsByTag("a").first() ;            String name=td2.attr("href").substring(9) ;  //还可以提升,把+号和%去掉            System.out.println("正在搜集第"+i+"个数据集：  "+name);             String link=td2.absUrl("href") ;            Document subpage=getPage(link) ;            dataset[i]=InfoCutPage(subpage);  //补充datase[i]的各项值            dataset[i].Num=i+1 ;            dataset[i].Name=name ;            print (dataset[i]) ;            ExcelDataPrint(dataset[i],"C:\\Users\\multiangle\\Desktop\\aaa.xls") ;        }        return dataset ;    }    public static UCIData InfoCutPage(Document doc) throws IOException{          /**        *   用来搜集单个数据集的信息        */        UCIData data=new UCIData() ;        Element table=doc.getElementsByTag("table").first().siblingElements().get(1) ;        Element td=table.getElementsByTag("td").first();        Element table2=td.getElementsByTag("table").first().nextElementSibling() ; //Simple Information            Element tr=table2.getElementsByTag("tr").first() ;      //the 1st line of simple information                Element td2=tr.getElementsByTag("td").first().siblingElements().get(2) ;                if(!td2.text().equals("N/A")) data.NumberofInstances= Integer.parseInt(td2.text()) ;            tr=tr.nextElementSibling() ;                            //the 2ed line of simple information                td2=tr.getElementsByTag("td").first().nextElementSibling();  //AttributeCharacteristics                data.AttributeCharacteristics=td2.text();    
            td2=td2.nextElementSibling().nextElementSibling() ;          //NumberofAttributes                if(!td2.text().equals("N/A")) data.NumberofAttributes=Integer.parseInt(td2.text()) ;                 td2=td2.nextElementSibling().nextElementSibling() ;          //Date                if (!td2.text().equals("N/A")) data.Year=Integer.parseInt(td2.text().substring(0,4)) ;            tr=tr.nextElementSibling() ;                            //the 3rd line of simple information                td2=tr.getElementsByTag("td").first().nextElementSibling();  //Associated Tasks                data.AssociatedTasks=td2.text() ;                td2=td2.nextElementSibling().nextElementSibling() ;          //DataFull                if (td2.text().equals("No")) data.DataFull=true ;                else data.DataFull=false ;                td2=td2.nextElementSibling().nextElementSibling() ;          //HitTimes                data.HitTimes=Integer.parseInt(td2.text()) ;        Element p=table2.siblingElements().get(6) ;            data.DataSetInformation=p.text() ;                      //DataSetInformation        p=p.nextElementSibling().nextElementSibling().nextElementSibling() ;            data.AttributeInformation=p.text() ;                    //AttributeInformation        String downlink_pre=cutPage1(doc) ;        data.DownloadLink=downlink_pre ;        return data ;    }    public static String cutPage1(Document doc) throws IOException{        Element font=doc.getElementsByTag("font").get(5).parent() ;        String link=font.absUrl("href") ;        return link ;    }    public static String cutPage2(Document doc) throws IOException{        Element tr=doc.getElementsByTag("tr").get(3) ;        Element link=tr.getElementsByTag("a").first() ;        String href=link.absUrl("href") ;        //System.out.println(href);        return href ;    }     public static Document getPage_inner(String url) throws IOException{        Document doc ;        try{            
doc=Jsoup.connect(url).get() ;            return doc ;        }catch(IOException e){            return null;        }    }    public static Document getPage(String url) throws IOException{        /**        *   用来获取页面代码，与上面的getPage_inner一道，有多次重连功能        */        try{            Document doc ;            int times=0 ;            //System.out.println("正在请求获取网页"+url);            doc=getPage_inner(url) ;            while(doc.equals(null)&&times<8){                doc=getPage_inner(url) ;                System.out.println("第"+times+"次请求失败，正在进行第"+(++times)+"次请求");            }            if (doc.equals(null)){                System.out.println("ERROR:获取网页失败");                return null ;            }else{                return doc ;            }        }catch(IOException e){            return null ;        }    }    public static void print(UCIData data){          System.out.println("Num:  "+data.Num) ;        System.out.println("Name:  "+data.Name) ;        System.out.println("AssociatedTasks:  "+data.AssociatedTasks) ;        System.out.println("AttributeCharacteristics:  "+data.AttributeCharacteristics) ;        System.out.println("NumberofInstances:  "+data.NumberofInstances) ;        System.out.println("NumberofAttributes:  "+data.NumberofAttributes) ;        System.out.println("DataFull:  "+data.DataFull) ;        System.out.println("Year:  "+data.Year) ;        System.out.println("HitTimes:  "+data.HitTimes) ;        System.out.println("DataSetInformation:  "+data.DataSetInformation) ;        System.out.println("AttributeInformation:  "+data.AttributeInformation) ;        System.out.println("DownloadLink:  "+data.DownloadLink) ;    }    public static void ExcelDataPrint(UCIData data,String filename) throws RowsExceededException, WriteException {        try {            File file=new File(filename) ;            Workbook wb = Workbook.getWorkbook(file);            WritableWorkbook book = Workbook.createWorkbook(file,wb);            WritableSheet sheet = 
book.getSheet(0) ;            int rownum=data.Num+1;            int colnum=0 ;            jxl.write.Number num=new jxl.write.Number(colnum++,rownum,data.Num) ;            Label name=new Label(colnum++,rownum,data.Name);            Label associatetasks=new Label(colnum++,rownum,data.AssociatedTasks);            Label attritebutecharacters=new Label(colnum++,rownum,data.AttributeCharacteristics);            jxl.write.Number instance=new jxl.write.Number(colnum++,rownum,data.NumberofInstances) ;            jxl.write.Number attributes=new jxl.write.Number(colnum++,rownum,data.NumberofAttributes) ;            String datafull ;            if (data.DataFull==true) datafull="true" ; else datafull="false" ;            Label dataful=new Label(colnum++,rownum,datafull) ;            jxl.write.Number year=new jxl.write.Number(colnum++,rownum,data.Year) ;            jxl.write.Number hittimes=new jxl.write.Number(colnum++,rownum,data.HitTimes) ;            Label datasetinfo=new Label(colnum++,rownum,data.DataSetInformation);            Label attributeinfo=new Label(colnum++,rownum,data.AttributeInformation);            Label downlink=new Label(colnum++,rownum,data.DownloadLink);            sheet.addCell(num);            sheet.addCell(name);            sheet.addCell(associatetasks);            sheet.addCell(attritebutecharacters);            sheet.addCell(instance);            sheet.addCell(attributes);            sheet.addCell(dataful);            sheet.addCell(year);            sheet.addCell(hittimes);            sheet.addCell(datasetinfo);            sheet.addCell(attributeinfo);            sheet.addCell(downlink);                book.write();            book.close();        } catch (BiffException e) {            // TODO Auto-generated catch block            e.printStackTrace();        } catch (IOException e) {            // TODO Auto-generated catch block            e.printStackTrace();        }    }    public static void ExcelTitlePrint(String filename) throws 
RowsExceededException, WriteException{        try {            File file=new File(filename) ;            WritableWorkbook book=Workbook.createWorkbook(file);            WritableSheet sheet=book.createSheet("FirstPage",0) ;            String[] colname={"Num" ,"Name","AssociatedTasks",                            "AttributeCharacteristics","NumberOfInstances","NumberOfAttributes",                            "DataFull","Year","HitTimes",                            "DataSetInformation","AttributeInformation","DownloadLink"};            for(int i=0;i<colname.length;i++){                Label label=new Label(i,0,colname[i]) ;                sheet.addCell(label);            }            book.write();            book.close();        } catch (IOException e) {            // TODO Auto-generated catch block            System.out.println("ERROR：创建Excel文件失败");        }   //打开文件     }}
0 0