Scraping Lagou Job Listings with Jsoup


This article uses Jsoup to scrape job-posting data from Lagou and write it to a CSV file. The implementation is quite simple, so I won't over-explain it; if you run into problems, feel free to leave a comment.


Data model: Job.java

package xyz.baal.jsoup;

public class Job {

    private String jobname;     // job title
    private String salary;      // salary
    private String place;       // work location
    private String experience;  // required experience
    private String educational; // required education
    private String business;    // company business
    private String stage;       // company development stage
    private String company;     // company name

    public Job() {
        super();
    }

    public Job(String jobname, String salary, String place, String experience, String educational, String business,
            String stage, String company) {
        super();
        this.jobname = jobname;
        this.salary = salary;
        this.place = place;
        this.experience = experience;
        this.educational = educational;
        this.business = business;
        this.stage = stage;
        this.company = company;
    }

    public String getJobname() {
        return jobname;
    }

    public void setJobname(String jobname) {
        this.jobname = jobname;
    }

    public String getSalary() {
        return salary;
    }

    public void setSalary(String salary) {
        this.salary = salary;
    }

    public String getPlace() {
        return place;
    }

    public void setPlace(String place) {
        this.place = place;
    }

    public String getExperience() {
        return experience;
    }

    public void setExperience(String experience) {
        this.experience = experience;
    }

    public String getEducational() {
        return educational;
    }

    public void setEducational(String educational) {
        this.educational = educational;
    }

    public String getBusiness() {
        return business;
    }

    public void setBusiness(String business) {
        this.business = business;
    }

    public String getStage() {
        return stage;
    }

    public void setStage(String stage) {
        this.stage = stage;
    }

    public String getCompany() {
        return company;
    }

    public void setCompany(String company) {
        this.company = company;
    }

    @Override
    public String toString() {
        return "Job [jobname=" + jobname + ", salary=" + salary + ", place=" + place + ", experience=" + experience
                + ", educational=" + educational + ", business=" + business + ", stage=" + stage + ", company="
                + company + "]";
    }
}



CSV output is handled with the javacsv library. Note that writing actually uses CsvWriter, not CsvReader: the library's source consists of just the two files CsvReader and CsvWriter, so the CsvWriter source file is pulled into the project directly (see the library's API documentation).
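For reference, here is a minimal, self-contained sketch of how CsvWriter is driven. The path D:/demo.csv and the column values are made up for illustration; GBK matches the encoding used later in this article:

package xyz.baal.jsoup;

import java.io.IOException;
import java.nio.charset.Charset;

import com.csvreader.CsvWriter;

public class CsvWriterDemo {
    public static void main(String[] args) throws IOException {
        // hypothetical output path; GBK so Chinese text opens cleanly in Excel
        CsvWriter wr = new CsvWriter("D:/demo.csv", ',', Charset.forName("GBK"));
        try {
            wr.writeRecord(new String[] { "col1", "col2" }); // writes one CSV row per call
            wr.writeRecord(new String[] { "a", "b" });
        } finally {
            wr.close(); // flushes buffered output and releases the file
        }
    }
}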



The first step is to collect the landing-page link of each job category from the Lagou home page; the Java listings, for example, live at //www.lagou.com/zhaopin/Java/. The links are protocol-relative, so the crawler prepends "http:" before connecting.

package xyz.baal.jsoup;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Collects the landing-page URL of every job category from the Lagou home page.
 */
public class GetZPURL {

    private List<String> zpURLlist = new ArrayList<String>(); // landing-page URLs of the job categories

    public GetZPURL() {
        super();
    }

    /**
     * Loads the HTML document over the network.
     *
     * @param url document URL
     */
    public void loadInternet(String url) {
        Document doc = null;
        try {
            doc = Jsoup.connect(url)
                    .userAgent("Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.101 Safari/537.36")
                    .timeout(5000)
                    .get();
        } catch (IOException e) {
            System.out.println("Failed to fetch the job-category URLs.");
            return;
        }
        Element content = doc.getElementById("container");
        if (content == null) { // guard against a page layout change
            return;
        }
        Elements links = content.getElementsByTag("a");
        for (Element link : links) {
            String linkHref = link.attr("href");
            if (isZp(linkHref)) {
                zpURLlist.add(linkHref);
            }
        }
    }

    /**
     * Loads the HTML document from a local file.
     *
     * @param path    file path
     * @param charset document character set
     * @param baseURL base URL, used as a prefix when links are relative
     * @throws IOException if the file does not exist or cannot be read
     */
    public void loadLocal(String path, String charset, String baseURL) throws IOException {
        File input = new File(path);
        Document doc = Jsoup.parse(input, charset, baseURL);
        Element content = doc.getElementById("container");
        if (content == null) {
            return;
        }
        Elements links = content.getElementsByTag("a");
        for (Element link : links) {
            String linkHref = link.attr("href");
            if (isZp(linkHref)) {
                zpURLlist.add(linkHref);
            }
        }
    }

    public boolean isZp(String url) {
        // a category link contains //www.lagou.com/zhaopin/ (24 chars) plus a category segment
        return url.indexOf("//www.lagou.com/zhaopin/") != -1 && url.length() > 24;
    }

    public List<String> getZpURLlist() {
        return zpURLlist;
    }
}
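A quick usage sketch for the class above; it assumes the Lagou home page is still served at http://www.lagou.com/ with the same #container markup:

package xyz.baal.jsoup;

public class GetZPURLDemo {
    public static void main(String[] args) {
        GetZPURL zp = new GetZPURL();
        // fetch the home page and collect every category link under #container
        zp.loadInternet("http://www.lagou.com/");
        for (String url : zp.getZpURLlist()) {
            System.out.println(url); // protocol-relative, e.g. //www.lagou.com/zhaopin/Java/
        }
    }
}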

Next, fetch the postings of a single job category, 30 result pages at 15 postings each (450 in total), and write them to a CSV file.

package xyz.baal.jsoup;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.csvreader.CsvWriter;

/**
 * Fetches the 30 x 15 postings of one Lagou job category.
 */
public class GetJob implements Runnable {

    private String zpUrl; // landing-page URL of the job category
    private List<String> zpUrlList = new ArrayList<String>();  // URL of each result page
    private List<String> jobUrlList = new ArrayList<String>(); // URL of each posting
    private List<Job> joblist = new ArrayList<Job>();          // holds the 30 x 15 postings
    private static final String A_HREF = "//www.lagou.com/jobs/\\d+.html"; // href pattern, e.g. //www.lagou.com/jobs/2350451.html
    private static final String PATH = "D:/"; // output directory
    private String jobName = ""; // job category name

    /**
     * @param url landing-page URL of a job category, e.g. Java or Hadoop
     */
    public GetJob(String url) {
        zpUrl = url;
    }

    /**
     * Scrapes all 450 postings of the category.
     */
    public void init() {
        // build the 30 result-page URLs
        zpUrlList.add(zpUrl + "?filterOption=3");
        for (int i = 2; i <= 30; i++) {
            zpUrlList.add(zpUrl + i + "/?filterOption=3");
        }
        // extract the posting URLs from each result page
        for (String string : zpUrlList) {
            Document doc = null;
            try {
                doc = Jsoup.connect("http:" + string)
                        .userAgent("Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.101 Safari/537.36")
                        .timeout(5000)
                        .get();
            } catch (IOException e) {
                continue;
            }
            Element content = doc.getElementById("s_position_list");
            if (content == null) {
                continue;
            }
            Elements links = content.getElementsByTag("a");
            if (links == null) {
                continue;
            }
            for (Element link : links) {
                String linkHref = link.attr("href");
                Pattern pattern = Pattern.compile(A_HREF, Pattern.CASE_INSENSITIVE);
                Matcher matcher = pattern.matcher(linkHref);
                if (matcher.find()) {
                    jobUrlList.add("http:" + linkHref);
                }
            }
            if (jobName.isEmpty()) { // was: jobName == "", which compares references, not content
                jobName = doc.select("title").first().text().split("-")[0];
            }
        }
        // fetch the details of every posting
        for (String string : jobUrlList) {
            Job job = new Job();
            Document doc = null;
            try {
                doc = Jsoup.connect(string)
                        .userAgent("Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.101 Safari/537.36")
                        .timeout(5000)
                        .get();
                job.setJobname(jobName);
                Element content = doc.getElementById("container");
                if (content == null) {
                    continue;
                }
                Element job_request = content.select(".job_request p").first();
                if (job_request != null && job_request.child(0) != null) {
                    job.setSalary(job_request.child(0).text());
                    job.setPlace(job_request.child(1).text());
                    job.setExperience(job_request.child(2).text());
                    job.setEducational(job_request.child(3).text());
                } else {
                    continue;
                }
                Element cpy = doc.getElementById("job_company");
                if (cpy != null && cpy.childNodeSize() >= 2) {
                    job.setCompany(cpy.child(0).child(0).child(0).attr("alt"));
                    job.setBusiness(cpy.child(1).child(0).child(0).ownText());
                    job.setStage(cpy.child(1).child(2).child(0).ownText());
                } else {
                    continue;
                }
                joblist.add(job);
            } catch (IOException e) {
                continue;
            }
        }
    }

    public List<Job> getJoblist() {
        return joblist;
    }

    /**
     * Writes the scraped data to a txt file.
     */
    public void writeTxtFile() {
        if (joblist == null || joblist.isEmpty()) { // was: size() checked before the null check
            return;
        }
        File file = new File(PATH + joblist.get(0).getJobname() + ".txt");
        FileWriter fw = null;
        BufferedWriter bw = null;
        Iterator<Job> iter = joblist.iterator();
        try {
            fw = new FileWriter(file);
            bw = new BufferedWriter(fw);
            while (iter.hasNext()) {
                bw.write(iter.next().toString());
                bw.newLine();
            }
            bw.flush();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            try {
                if (bw != null) {
                    bw.close();
                }
                if (fw != null) {
                    fw.close();
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Writes the scraped data to a CSV file.
     */
    public void writeCSVFile() {
        CsvWriter wr = null;
        if (joblist == null || joblist.isEmpty()) {
            return;
        }
        try {
            String csvFilePath = PATH + joblist.get(0).getJobname() + ".csv";
            wr = new CsvWriter(csvFilePath, ',', Charset.forName("GBK"));
            // column headers: job title, salary, location, experience, education, company, business, stage
            String[] header = { "职位名称", "薪水", "工作地点", "工作经验", "学历", "公司名称", "公司业务", "发展阶段" };
            wr.writeRecord(header);
            for (Job job : joblist) {
                String[] jobstr = { job.getJobname(), job.getSalary(), job.getPlace(), job.getExperience(),
                        job.getEducational(), job.getCompany(), job.getBusiness(), job.getStage() };
                wr.writeRecord(jobstr);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (wr != null) {
                wr.close();
            }
        }
    }

    @Override
    public void run() {
        init();
        writeCSVFile();
        System.out.println(jobName + "--End");
    }
}
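One practical note: the loops in init() fire requests back-to-back, and Lagou throttles aggressive clients. A tiny, hypothetical helper like the one below (not part of the original code) could be called between requests to space them out; it uses only the standard library:

package xyz.baal.jsoup;

import java.util.concurrent.ThreadLocalRandom;

public final class Politeness {

    private Politeness() {
    }

    /** Sleeps for a random interval between minMillis (inclusive) and maxMillis (exclusive). */
    public static void pause(long minMillis, long maxMillis) {
        try {
            Thread.sleep(ThreadLocalRandom.current().nextLong(minMillis, maxMillis));
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt(); // preserve the interrupt flag for the pool
        }
    }
}

Calling, say, Politeness.pause(500, 2000) after each Jsoup.connect(...).get() in init() spaces the requests out, at the cost of a slower crawl.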

A scraping test:

package xyz.baal.jsoup;

import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;

public class Test {

    public static void main(String[] args) {
        // queue for tasks waiting on a free thread
        BlockingQueue<Runnable> bqueue = new ArrayBlockingQueue<Runnable>(20);
        // thread pool: 3 core threads, at most 4 threads, 50 ms keep-alive for the extra thread
        ThreadPoolExecutor pool = new ThreadPoolExecutor(3, 4, 50, TimeUnit.MILLISECONDS, bqueue);

        Runnable job1 = new GetJob("//www.lagou.com/zhaopin/iOS/");
        Runnable job2 = new GetJob("//www.lagou.com/zhaopin/C/");
        Runnable job3 = new GetJob("//www.lagou.com/zhaopin/C++/");
        Runnable job4 = new GetJob("//www.lagou.com/zhaopin/Python/");
        Runnable job5 = new GetJob("//www.lagou.com/zhaopin/HTML5/");
        Runnable job6 = new GetJob("//www.lagou.com/zhaopin/webqianduan/");

        pool.execute(job1);
        pool.execute(job2);
        pool.execute(job3);
        pool.execute(job4);
        pool.execute(job5);
        pool.execute(job6);

        // stop accepting new tasks; already-submitted crawls still run to completion
        pool.shutdown();
    }
}
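pool.shutdown() only stops new submissions; the queued crawls keep running. If main needs to block until every CSV file has been written (for example, to post-process them), the final pool.shutdown() above can be replaced with a sketch like this, using the standard awaitTermination (the 30-minute timeout is an arbitrary choice):

pool.shutdown();
try {
    // wait up to 30 minutes for the six crawls to finish
    if (!pool.awaitTermination(30, TimeUnit.MINUTES)) {
        pool.shutdownNow(); // timed out: interrupt whatever is still running
    }
} catch (InterruptedException e) {
    pool.shutdownNow();
    Thread.currentThread().interrupt(); // restore the interrupt flag
}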


If you need IP proxies, proxy lists can be found at: http://www.xicidaili.com/
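Recent Jsoup versions (1.9 and later) can route a single connection through a proxy directly. A minimal sketch, where 127.0.0.1:8888 is a placeholder to be replaced with a live proxy picked from a list such as the site above:

package xyz.baal.jsoup;

import java.io.IOException;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class ProxyDemo {
    public static void main(String[] args) throws IOException {
        // placeholder proxy address; substitute a working HTTP proxy
        Document doc = Jsoup.connect("http://www.lagou.com/")
                .proxy("127.0.0.1", 8888)
                .timeout(5000)
                .get();
        System.out.println(doc.title());
    }
}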
