java-jsoup自适应爬取网页表格的内容

来源:互联网 发布:武汉程序员工资 编辑:程序博客网 时间:2024/05/16 17:15


  在爬取数据的过程中,我们有时候需要爬取页面中的表格 但表格的样式千变万化  下面的类和方法可以解析大部分的表格  得到 属性名 和 对应值.


  需要的包链接:

  http://download.csdn.net/detail/q383965374/5960953


类如下:


TestCrawTable  -----测试类

package com;import java.io.IOException;import java.util.List;import org.jsoup.Jsoup;import org.jsoup.nodes.Document;public class TestCrawTable {public static void main(String[] args) {try {Document document = Jsoup.connect("http://www.landnj.cn/LandInfo.aspx?flag=gongshi&Id=172").get();if (document != null) {List<TableElement> tableElemts = DataTableUtil.getFitElement(document);if (tableElemts != null && tableElemts.size() > 0) {List<PropertyInfo> propertyInfos = TableUtil.extractPropertyInfos(tableElemts);if (propertyInfos != null && propertyInfos.size() > 0) {for (PropertyInfo propertyInfo : propertyInfos) {System.out.println(propertyInfo.getName() + "  "+ propertyInfo.getValue());}System.out.println("-----------------------------------");}}}} catch (IOException e) {e.printStackTrace();}}}


DataTableUtil --------验证是否是我们需要的table

package com;import java.util.ArrayList;import java.util.List;import java.util.regex.Matcher;import java.util.regex.Pattern;import org.jsoup.nodes.Document;import org.jsoup.nodes.Element;import org.jsoup.parser.Tag;import org.jsoup.select.Elements;public class DataTableUtil {private final static int NUM = 5;//要抓取的表格可能出现的属性名static String[] Propertys={"地块编号","宗地编号","地块位置","用地性质","规划面积","出让面积","发布时间","挂牌起始价","位置","交易时间","面积","规划用途","容积率","起价","成交价","交易方式","竞得人"};/** * 找到适合的装数据的TABle *  * @param location * @param document * @return *///取最里面的table进入isValueElement方法检测是不是我们需要的tablepublic static List<TableElement> getFitElement(Document document) {if (Propertys != null) {Element element = document.getElementsByTag("body").get(0);List<TableElement> fitElments = new ArrayList<TableElement>();Elements tableElements = element.getElementsByTag("table");if (tableElements != null && tableElements.size() > 0) {for (int i = 0; i < tableElements.size(); i++) {Element tableElement = tableElements.get(i);Elements ces = tableElement.getElementsByTag("table");if (ces != null && ces.size() > 1) {} else {TableElement te;if ((te = isValueElement(Propertys,tableElement)) != null) {fitElments.add(te);}}}} else {return null;}return fitElments;}return null;}//找出属性名所在的行,把属性名所在的行之上的信息去掉//ri为属性名所在行的结尾索引//row为属性名所在行占的行数//ri-row为属性名所在行的开始索引,这里取属性名所在行到表格的结尾行private static Element removeRedundance(String[] Propertys,Element element) {Elements tres = element.getElementsByTag("tr");Element trElement = tres.get(0);Elements tde = trElement.getElementsByTag("td");int row = 1;for (Element tdElement : tde) {String attribute = tdElement.attr("rowspan");if (attribute != null && !attribute.equals("")) {int rowSpan = Integer.valueOf(attribute);if (rowSpan > row) {row = rowSpan;}}}List<Element> elements = new ArrayList<Element>();for (int i = 0; i < row; i++) {elements.add(tres.get(i));}int ri = 0;while (!isValueElements(Propertys, elements)) {elements = new ArrayList<Element>();row = 1;Elements tdes = tres.get(ri).getElementsByTag("td");for (Element tdElement : tdes) {String attribute = tdElement.attr("rowspan");if (attribute != null && !attribute.equals("")) {int rowSpan = Integer.valueOf(attribute);if (rowSpan > row) {row = rowSpan;}}}for (int i = 0; i < row; i++) {elements.add(tres.get(ri + i));}ri = ri + row;}if (ri > 0) {Elements trs = element.getElementsByTag("tr");int size = trs.size();Element newElement = new Element(Tag.valueOf("table"), "table");for (int i = ri-row; i < size; i++) {newElement.appendChild(trs.get(i));}return newElement;}return element;}private static boolean isValueElements(String[] Propertys, List<Element> trElements) {int index = 0;int size = trElements.size();for (int i = 0; i < size; i++) {List<Element> propertyElements = new ArrayList<Element>();Element element = trElements.get(i);Elements tdElements = element.getElementsByTag("td");    Elements thElements = element.getElementsByTag("th");if(thElements!=null&&thElements.size()>0){for(Element thelement : thElements){propertyElements.add(thelement);}}if(tdElements!=null&&tdElements.size()>0){for(Element tdelement : tdElements){propertyElements.add(tdelement);}}for (Element tdElement : propertyElements) {String text = tdElement.text();if (!text.trim().equals("")) {String value = adjuestmentParm(text);if (value != null) {value = StringUtil.parseString(value);double max = 0.0d;for (int j = 0; j < Propertys.length; j++) {double temp = SimFeatureUtil.sim(Propertys[j], value);if (temp > max) {max = temp;}}if (max >= 0.6) {index++;}}}}}if (index >= NUM) {return true;} else {return false;}}private static boolean regual(String reg, String string) {Pattern pattern = Pattern.compile(reg);Matcher ma = pattern.matcher(string);if (ma.find()) {return true;}return false;}//清除获取的信息标题上的单位及一些字符提高匹配的相似度public static String adjuestmentParm(String parm) {if (regual("\\(", parm)) {parm = parm.substring(0, parm.indexOf("("));} else if (regual("(", parm)) {parm = parm.substring(0, parm.indexOf("("));} else if (regual("万元", parm)) {parm = parm.substring(0, parm.indexOf("万元"));} else if (regual("亩", parm)) {parm = parm.substring(0, parm.indexOf("亩"));} else if (regual("公顷", parm)) {parm = parm.substring(0, parm.indexOf("公顷"));} else if (regual("米", parm)) {parm = parm.substring(0, parm.indexOf("米"));}return parm;}//取出表格中的td或者th单元格的值进行相似度比较 //index记录相似值大于0.6的值个数  //consist记录连续不相似个数   //当连续不相似的个数达到10个时 就break 跳出循环//consistFlag记录连续相似个数//当index个数大于等于我们设定的个数时说明该table是包含我们要获取数据的table//当consistFlag大于2时说明表格内容是竖向的 setCross为false  //进入removeRedundance方法检查是否table中的前几行(tr)不是以标题行开始,把这几行去掉//例如:http://kmland.km.gov.cn/view.asp?id=7089&frist_lanmu=173private static TableElement isValueElement(String[] Propertys, Element element) {TableElement tableElement = new TableElement();List<Element> propertyElements = new ArrayList<Element>();Elements tdElements = element.getElementsByTag("td");    Elements thElements = element.getElementsByTag("th");if(thElements!=null&&thElements.size()>0){for(Element thelement : thElements){propertyElements.add(thelement);}}if(tdElements!=null&&tdElements.size()>0){for(Element tdelement : tdElements){propertyElements.add(tdelement);}}int index = 0;int consist = 0;int size = propertyElements.size();int consistFlag = 0;for (int i = 0; i < size; i++) {consist++;Element tdElement = propertyElements.get(i);String text = tdElement.text();if (!text.trim().equals("")) {String value = adjuestmentParm(text);if (value != null) {value = StringUtil.parseString(value);double max = 0.0d;for (int j = 0; j < Propertys.length; j++) {double temp = SimFeatureUtil.sim(Propertys[j], value);if (temp > max) {max = temp;}}if (max >= 0.6) {index++;if (consist == 1) {consist = 0;consistFlag++;} else {consist = 0;}} else {if (consist >= 10) {break;}}}}}if (index >= NUM) {tableElement.setWordNum(index);if (consistFlag >= 2) {tableElement.setElement(removeRedundance(Propertys, element));tableElement.setCross(false);} else {tableElement.setElement(element);}return tableElement;} else {return null;}}}

TableUtil -----解析表格
package com;import java.util.ArrayList;import java.util.List;import org.jsoup.nodes.Element;import org.jsoup.select.Elements;public class TableUtil {//横向的表格进入getAcrossTableValue方法解析 竖向的表格进入getTableValue方法解析public static List<PropertyInfo> extractPropertyInfos(List<TableElement> tableElemts)  {List<PropertyInfo> propertyInfos = new ArrayList<PropertyInfo>();if(tableElemts!=null&&tableElemts.size()>0){for (TableElement element : tableElemts) {if(element.isCross()){propertyInfos.addAll(TableUtil.getAcrossTableValue(element.getElement()));}else{propertyInfos.addAll(TableUtil.getTableValue(element.getElement()));}}return propertyInfos;}return null;}//对于竖向内容的表格先取出属性名占的最大行数存为rowpublic static List<PropertyInfo> getTableValue(Element tableElement) {Elements trElements = tableElement.getElementsByTag("tr");int row = getRowElement(trElements.get(0));List<String> propertys = parseTablePropertys(row,trElements);return  parsePropertyValue(row, propertys, trElements);}/** * 提取TABLE属性 *  * @param tableElement *///把每行的元素存入ptdELementsList列表中,把每行的列数存入length 然后进入parsePropertyString函数对第一行内容提取得到属性名private static List<String> parseTablePropertys(int row,Elements trElements) {List<String> propertys = new ArrayList<String>();List<Elements> ptdELementsList = new ArrayList<Elements>();int[] lengths = new int[row]; // 储存每行的长度(列数-单元格个数)int[] index = new int[row]; // 每行位置for (int i = 0; i < row; i++) {Element trElement = trElements.get(i);Elements elements = new Elements();Elements tdElements = trElement.getElementsByTag("td");Elements thElements = trElement.getElementsByTag("th");if(tdElements!=null&&tdElements.size()>0){elements.addAll(tdElements);}if(thElements!=null&&thElements.size()>0){elements.addAll(thElements);}ptdELementsList.add(elements);lengths[i] = elements.size();}parsePropertyString(propertys, index, lengths, 0, ptdELementsList);return propertys;}//提取内容的值---遍历除属性行之外的行 依次提取td元素进入parseValues提取内容的值//flags记录行的状态//valueFlags记录每个属性一个值对多个单元格的状态//vtdElements记录每行单元格的元素td//size值的总行数//length记录每行的单元格td数private static List<PropertyInfo> parsePropertyValue(int row, List<String> propertys,Elements trElements) {int propertysSize = propertys.size();List<Elements> vtdElements = new ArrayList<Elements>();int trsize = trElements.size();int size = trsize - row;int[] lengths = new int[size];boolean[] flags = new boolean[size];boolean[][] valueFlags = new boolean[size][propertysSize];for (int i = row; i < trsize; i++) {Element trElement = trElements.get(i);Elements velements = new Elements();Elements tdElements = trElement.getElementsByTag("td");Elements thElements = trElement.getElementsByTag("th");if(thElements!=null&&thElements.size()>0){velements.addAll(thElements);}if(tdElements!=null&&tdElements.size()>0){velements.addAll(tdElements);}//Elements tdElements = trElement.getElementsByTag("td");if (velements != null && velements.size() > 0) {lengths[i-row] = velements.size();vtdElements.add(velements);for(int j = 0 ; j < propertysSize;j++){valueFlags[i-row][j]= false;}flags[i-row] = true;} else {size--;}}return parseValues(propertys, vtdElements, size, lengths, flags,valueFlags);}/** * 取TABLE里面的数据 包含 ROWSPAN 表明该元素对应多个PROPERTY 包含COLSPAN表明该元素对应多条数据 *  * @param propertysSize * @param vtdElements * @param size * @param lengths * @param flags * @param valueFlags *///一个单元格的内容对应多条记录的 例子:http://kmland.km.gov.cn/view.asp?id=7089&frist_lanmu=173//遇到一对多的值 则取出它的rowspan和colspan的,把对应位置的值赋值,并修改该位置的valueFlag为trueprivate static List<PropertyInfo> parseValues(List<String> propertys,List<Elements> vtdElements, int size, int[] lengths,boolean[] flags, boolean[][] valueFlags) {int propertysSize = propertys.size();String[][] pValueStrs = new String[size][propertysSize];for (int i = 0; i < size; i++) {Elements tdValueElements = vtdElements.get(i);int k = 0;for (int j = 0; j < lengths[i]; j++) {Element tdValueElement = null;try {tdValueElement = tdValueElements.get(j);} catch (Exception e) {flags[i] = false;break;}if(valueFlags[i][k]){while (valueFlags[i][k]) {k++;}}if (tdValueElement.hasAttr("rowspan")) {int rowspan = Integer.valueOf(tdValueElement.attr("rowspan"));int colspan = 1;if (tdValueElement.hasAttr("colspan")) {colspan = Integer.valueOf(tdValueElement.attr("colspan"));}if (rowspan > 1) {System.out.println(rowspan);for (int m = 0; m < rowspan; m++) {for (int n = 0; n < colspan; n++) {System.out.println("i = " + i +"m=" +m);try {valueFlags[i + m][k + n] = true;pValueStrs[i + m][k + n] = tdValueElement.text();} catch (Exception e) {System.out.println("i = " + i +"m=" +m +"k+n = "+ (k+n)+ "length");}}}}}else if (tdValueElement.hasAttr("colspan")) {int colspan = Integer.valueOf(tdValueElement.attr("colspan"));if (colspan > 1) {if (colspan >= size - 1) {flags[i] = false;break;}for (int m = 0; m < colspan; m++) {valueFlags[i][k] = true;pValueStrs[i+m][k] = tdValueElement.text();}}}else{if(tdValueElement!=null){pValueStrs[i][k] = tdValueElement.text();}}k++;}}List<PropertyInfo> propertyInfos = new ArrayList<PropertyInfo>();for (int i = 0; i < size; i++) {if (flags[i]) {for (int j = 0; j < propertysSize; j++) {if(propertys.get(j)!=null){PropertyInfo propertyValue = new PropertyInfo();propertyValue.setName(propertys.get(j));propertyValue.setValue(pValueStrs[i][j]);propertyInfos.add(propertyValue);}}}}return propertyInfos;}public static List<PropertyInfo> getAcrossTableValue(Element element){//Elements tdElements = element.getElementsByTag("td");Elements trElements = element.getElementsByTag("tr");List<PropertyInfo> propertyValues = new ArrayList<PropertyInfo>();for(Element trElement:trElements){Elements tdElements = trElement.getElementsByTag("td");int size = tdElements.size();if(size % 2 ==0){PropertyInfo propertyValue = null;for (int i = 0; i < size; i++) {Element tdElement = tdElements.get(i);String value = tdElement.text();if (i % 2 == 0) {propertyValue = new PropertyInfo();propertyValue.setName(StringUtil.parseString(value));} else {propertyValue.setValue(value);propertyValues.add(propertyValue);}}}}return propertyValues;}/** * 提取属性名 *  * @param propertyStrs * @param index * @param lengths * @param ind * @param tdElementsList *///遍历属性行中的td,取它的colspan,如果它横跨的大于1则说明它是包含多个小标题的大标题,则进入第二行取值private static void parsePropertyString(List<String> propertyStrs,int[] index, int[] lengths, int ind, List<Elements> tdElementsList) {Elements tdElements = tdElementsList.get(ind);for (int i = index[ind]; i < lengths[ind]; i++) {index[ind] = index[ind] + 1;Element tdElement = tdElements.get(i);String attribute = tdElement.attr("colspan");String value = StringUtil.parseString(tdElement.text());//if (!value.trim().equals("")) {if (attribute != null && !attribute.equals("")) {int col = Integer.valueOf(attribute);if (col > 1) {parsePropertyString(propertyStrs, index, lengths,ind + 1, tdElementsList);} else {propertyStrs.add(value);}} else {propertyStrs.add(value);}//}}}/** * 提取属性占的ROW数 取出属性中含有的最大ROWSPAN数 *  * @param trElement * @return */private static int getRowElement(Element trElement) {Elements tdElements = trElement.getElementsByTag("td");int row = 1;for (Element tdElement : tdElements) {String attribute = tdElement.attr("rowspan");if (attribute != null && !attribute.equals("")) {int rowSpan = Integer.valueOf(attribute);if (rowSpan > row) {row = rowSpan;}}}return row;}}

下面是一些工具类和元素类

PropertyInfo

package com;public class PropertyInfo {private String name;private String value;public String getName() {return name;}public void setName(String name) {this.name = name;}public String getValue() {return value;}public void setValue(String value) {this.value = value;}}

SimFeatureUtil 相似度对比

package com;public class SimFeatureUtil {private static int min(int one, int two, int three) {int min = one;if (two < min) {min = two;}if (three < min) {min = three;}return min;}public static int ld(String str1, String str2) {int d[][]; // 矩阵int n = str1.length();int m = str2.length();int i; // 遍历str1的int j; // 遍历str2的char ch1; // str1的char ch2; // str2的int temp; // 记录相同字符,在某个矩阵位置值的增量,不是0就是1if (n == 0) {return m;}if (m == 0) {return n;}d = new int[n + 1][m + 1];for (i = 0; i <= n; i++) { // 初始化第一列d[i][0] = i;}for (j = 0; j <= m; j++) { // 初始化第一行d[0][j] = j;}for (i = 1; i <= n; i++) { // 遍历str1ch1 = str1.charAt(i - 1);// 去匹配str2for (j = 1; j <= m; j++) {ch2 = str2.charAt(j - 1);if (ch1 == ch2) {temp = 0;} else {temp = 1;}// 左边+1,上边+1, 左上角+temp取最小d[i][j] = min(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1]+ temp);}}return d[n][m];}public static double sim(String str1, String str2) {try {double ld = (double)ld(str1, str2);return (1-ld/(double)Math.max(str1.length(), str2.length()));//double length = (double)str1.length() /(double) str2.length();//return (1 - (double) ld / Math.max(str1.length(), str2.length()))* length;} catch (Exception e) {return 0.1;}}public static void main(String[] args) {String str1 = "地块编号";String str2 = "地块编号";System.out.println("ld=" + ld(str1, str2));System.out.println("sim=" + sim(str1, str2));}}


StringUtil 去除多余字符

package com;import java.util.regex.Matcher;import java.util.regex.Pattern;public class StringUtil {private static Pattern pattern = Pattern.compile("[\\u4e00-\\u9fa5a-zA-Z\\d\\../:]");/** * 去掉多余字符 *  * @return */public static String parseString(String str) {StringBuilder sb = new StringBuilder();Matcher matcher = pattern.matcher(str);while (matcher.find()) {sb.append(matcher.group());}return sb.toString();}}

TableElement
package com;import org.jsoup.nodes.Element;public class TableElement {private Element element;private boolean isCross;private int wordNum;public TableElement(){isCross = true;}public Element getElement() {return element;}public void setElement(Element element) {this.element = element;}public boolean isCross() {return isCross;}public void setCross(boolean isCross) {this.isCross = isCross;}public int getWordNum() {return wordNum;}public void setWordNum(int wordNum) {this.wordNum = wordNum;}}

提取的表格:




运行结果:



完整的例子下载:

java爬取网页表格的例子(运行环境myeclipse)




0 0
原创粉丝点击