一个小爬虫和正则表达式的例子,用于获取2015年迅雷校招的笔试名单

来源:互联网 发布:hishop 上传商品数据 编辑:程序博客网 时间:2024/06/06 05:21

笔记:

jsoup.jar包常用于制作网页爬虫,使用时只需要导入jsoup.jar这一个包即可,具体用法参考点击打开链接。通常解析web也伴随着正则表达式的使用,正则表达式group的概念参考点击打开链接。

import java.io.IOException;import java.lang.reflect.Array;import java.util.ArrayList;import java.util.regex.Matcher;import java.util.regex.Pattern;import org.jsoup.Jsoup;import org.jsoup.nodes.Document;import org.jsoup.nodes.Element;import org.jsoup.select.Elements;public class JsoupTest {  int count;int OPM;int Cplusplus;int SystemOperationEngineer;int  DataDevelopmentEngineer;int PM;int VisualDesigner;int  WebReconstructionEngineer;int InteractionDesigner;int BusinessSpecialist;int city;ArrayList <WHUT> WHUTStudent;         public  void get(int city) throws IOException  {            count=1;            OPM=0;            Cplusplus=0;            SystemOperationEngineer=0;            DataDevelopmentEngineer=0;            PM=0;            VisualDesigner=0;            WebReconstructionEngineer=0;            InteractionDesigner=0;            BusinessSpecialist=0;            this.city=city;             WHUTStudent=new ArrayList <WHUT>();              Document doc = Jsoup.connect("http://svr.campus.xunlei.com/viewlist?callback=jQuery110209096807448659092_1444288837792&city="+city+"&from=0&to=100000&name=-1&_=0").get();                String value=doc.toString();                 Pattern pattern_name = Pattern.compile(""name":"(.+?)",");                 Matcher macher_name = pattern_name.matcher(value);                Pattern pattern_position = Pattern.compile("position":"(.+?)",");                Matcher macher_position = pattern_position.matcher(value);                Pattern pattern_school = Pattern.compile("school":"(.+?)",");                Matcher macher_school = pattern_school.matcher(value);                Pattern pattern_major = Pattern.compile("major":"(.+?)",");                Matcher macher_major = pattern_major.matcher(value);                Pattern pattern_time = Pattern.compile("time":"(.+?)",");                Matcher macher_time = pattern_time.matcher(value);                 
while(macher_name.find()&&macher_position.find()&&macher_school.find()&&macher_major.find()&&macher_time.find())        {                if(city==-1){        System.out.println("编号:"+count);        System.out.println(macher_name.group(1));        System.out.println(macher_position.group(1));        System.out.println(macher_school.group(1));        System.out.println(macher_major.group(1));        System.out.println(macher_time.group(1));        System.out.println("************************************");        }                count++;                if(macher_position.group(1).equals("运营产品经理"))OPM++;        if(macher_position.group(1).equals("C++开发工程师"))Cplusplus++;                if(macher_position.group(1).equals("系统运维工程师"))SystemOperationEngineer++;        if(macher_position.group(1).equals("数据开发工程师"))DataDevelopmentEngineer++;                if(macher_position.group(1).equals("产品经理"))PM++;        if(macher_position.group(1).equals("视觉设计师"))VisualDesigner++;        if(macher_position.group(1).equals("网页重构工程师"))WebReconstructionEngineer++;        if(macher_position.group(1).equals("交互设计师")) InteractionDesigner++;        if(macher_position.group(1).equals("商务专员")) BusinessSpecialist++;                if(macher_school.group(1).equals("武汉理工大学")) {                WHUT tmp=new WHUT();                tmp.name=macher_name.group(1);                tmp.position=macher_position.group(1);                tmp.school=macher_school.group(1);                tmp.major=macher_major.group(1);                tmp.time=macher_time.group(1);                WHUTStudent.add(tmp);                }        }                        }          public void print(){    switch(city){        case -1:System.out.println("全国:");break;        case 12:System.out.println("西安:");break;        case 11:System.out.println("成都:");break;        case 8:System.out.println("武汉:");break;        case 4: System.out.println("广州:");break;    }                System.out.println("商务专员:"+BusinessSpecialist+"人");    
System.out.println("产品经理:"+PM+"人");    System.out.println("视觉设计师:"+VisualDesigner+"人");    System.out.println("交互设计师:"+InteractionDesigner+"人");    System.out.println("运营产品经理:"+OPM+"人");    System.out.println("C++开发工程师:"+Cplusplus+"人");    System.out.println("网页重构工程师:"+WebReconstructionEngineer+"人");    System.out.println("系统运维工程师:"+SystemOperationEngineer+"人");    System.out.println("数据开发工程师:"+DataDevelopmentEngineer+"人");    System.out.println("************************************");        if(city==8){        System.out.println("其中武汉理工大学的学生有:"+WHUTStudent.size()+"人");        for(int i=0;i<WHUTStudent.size();i++)        {System.out.println(WHUTStudent.get(i).name);    System.out.println(WHUTStudent.get(i).position);    System.out.println(WHUTStudent.get(i).major);    System.out.println(WHUTStudent.get(i).school);        System.out.println("#########################");    }        System.out.println("************************************");        }    }public static void main(String[] args) throws Exception {          JsoupTest  wholeCountry=new  JsoupTest (); wholeCountry.get(-1);//全国 wholeCountry.print(); wholeCountry.get(12);//西安 wholeCountry.print(); wholeCountry.get(11);//成都 wholeCountry.print(); wholeCountry.get(8);//武汉 wholeCountry.print(); wholeCountry.get(4);//广州 wholeCountry.print();          }}

/**
 * Plain data holder for one record of the list: five string fields with
 * package-private access and no behavior of its own.
 */
public class WHUT {
    /** Candidate name. */
    String name;

    /** Position applied for. */
    String position;

    /** School name. */
    String school;

    /** Major. */
    String major;

    /** Time value exactly as it appears in the source data. */
    String time;
}


                                             
0 0
原创粉丝点击