原来java的正则也很强大,搜索html文档,根据要求替换img标签中的src属性

来源:互联网 发布:巴克敏斯特·富勒 知乎 编辑:程序博客网 时间:2024/04/28 21:07

以前一直以为Java的正则太弱了,到今天才发现是自己太弱了。下面记录一个比较复杂的匹配过程:搜索HTML文档,根据要求替换img标签中的src属性。这样做的目的是:在发送带图片的邮件时,通过邮件头属性Content-ID把附件与正文中的图片关联起来。

这里面涵盖了Java正则的分组和内嵌标志(内嵌标志在Pattern类中都有对应的字段,比如(?s)对应Pattern.DOTALL,即让“.”也能匹配换行符)


regex.txt是我们需要进行匹配的对象,它的内容为:

<p><img style="WIDTH: 704px; HEIGHT: 379px" alt="acb" width="1075" height="593" src="/sale/common/FCKeditor/UserFilesImage/a.jpg" /></p><p><img style="WIDTH: 704px; HEIGHT: 379px" alt="acddb" width="1075" height="593" src="/sale/common/FCKeditor/UserFilesImage/ade.jpg" /></p><img alt="" src="/sale/cms/simpleDownload?fileId=297e022a3bb3d941013bb626e75d0047" /></div><div>--<br /><p><img alt="" src="/sale/common/FCKeditor/editor/images/smiley/msn/regular_smile.gif" /><img style="WIDTH: 704px; HEIGHT: 379px" alt="acb" width="1075" height="593" src="/sale/common/FCKeditor/UserFilesImage/a.jpg" /></p><img src="www.baidu.com" />


然后是整个java处理的源文件:


package com.liyzh.regex;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.liyzh.file.ReadFromFile;

/**
 * Demo: scans an HTML fragment for {@code <img>} tags whose {@code src} points
 * either at an FCKeditor resource or at a {@code cms/simpleDownload} attachment,
 * and prints each tag with its URL replaced by a {@code cid:} reference.
 *
 * <p>The rewritten tags are only printed; the HTML read from the file is not
 * modified.
 */
public class ReplaceImgTag {

    /**
     * Matches the start of an {@code <img>} tag up to and including the closing
     * quote of its {@code src} attribute.
     *
     * <ul>
     *   <li>group 0 - the tag from {@code <img} through the end of the src attribute</li>
     *   <li>group 1 - the whole src attribute, {@code src="..."}</li>
     *   <li>group 2 - the src value, including any leading context path</li>
     *   <li>group 3 - the src value without the leading context path; it is
     *       always exactly one of group 4 or group 5</li>
     *   <li>group 4 - an FCKeditor resource path ({@code common/FCKeditor...})</li>
     *   <li>group 5 - an attachment download path
     *       ({@code cms/simpleDownload?fileId=...})</li>
     *   <li>group 6 - the attachment id inside group 5; {@code null} when
     *       group 4 matched instead</li>
     * </ul>
     *
     * <p>{@code [^>]*?} and {@code [^"]*?} are used instead of a DOTALL
     * {@code .*?} so that a match can never start in one tag and end inside the
     * {@code src} of a later one (for example when an unrelated
     * {@code <img src="www.baidu.com" />} precedes a matching tag). Negated
     * character classes also match line terminators, so the {@code (?s)} flag
     * is not needed. Known limitation: a literal {@code >} inside an attribute
     * that precedes {@code src} prevents the tag from matching.
     */
    private static final Pattern IMG_SRC = Pattern.compile(
            "<img[^>]*?(src=\"([^\"]*?((common/FCKeditor[^\"]*?)"
                    + "|(cms/simpleDownload\\?fileId=([0-9a-zA-Z]*))))\")");

    /**
     * Reads {@code c:/regex.txt}, prints the capture groups of every matching
     * {@code <img>} tag, then prints the collected map and list.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String fileName = "c:/regex.txt";
        // Holds two kinds of entries: attachment id -> "true", and the single
        // key "fckImages" -> list of FCKeditor paths (added after the loop).
        Map<String, Object> cidMap = new HashMap<String, Object>();
        List<String> fckImages = new ArrayList<String>();
        String html = ReadFromFile.readFile(fileName).toString();

        Matcher matcher = IMG_SRC.matcher(html);
        while (matcher.find()) {
            String img = matcher.group(0);
            String src = matcher.group(1);
            String url = matcher.group(2);
            // Path relative to the web root, i.e. without the context path.
            String g3_fckUrl = matcher.group(3);
            String fileId = matcher.group(6);

            if (fileId != null) {
                // Existing attachment: its file id is used as the content id.
                cidMap.put(fileId, "true");
                String newUrl = "cid:" + fileId;
                img = img.replace(url, newUrl);
            } else {
                // FCKeditor image: a fresh random content id is generated.
                String cid = UUID.randomUUID().toString();
                String newUrl = "cid:" + cid;
                img = img.replace(url, newUrl);
                fckImages.add(g3_fckUrl);
            }

            System.out.println("group0--img tag is : ");
            System.out.println(img);
            System.out.println("group1--src attr is : ");
            System.out.println("\t" + src);
            System.out.println("group2--url is : ");
            System.out.println("\t" + url);
            System.out.println("group345--fckUrl or attUrl is : ");
            System.out.println("\t" + g3_fckUrl);
            System.out.println("group6--fileId is : ");
            System.out.println("\t" + fileId);
            System.out.println();
        }

        cidMap.put("fckImages", fckImages);
        System.out.println(cidMap);
        System.out.println(fckImages);
    }
}



最后是匹配的输出结果:


group0--img tag is : 
<img style="WIDTH: 704px; HEIGHT: 379px" alt="acb" width="1075" height="593" src="cid:a70efa3e-ede9-4143-9110-8288f4592cf4"
group1--src attr is : 
	src="/sale/common/FCKeditor/UserFilesImage/a.jpg"
group2--url is : 
	/sale/common/FCKeditor/UserFilesImage/a.jpg
group345--fckUrl or attUrl is : 
	common/FCKeditor/UserFilesImage/a.jpg
group6--fileId is : 
	null

group0--img tag is : 
<img style="WIDTH: 704px; HEIGHT: 379px" alt="acddb" width="1075" height="593" src="cid:e3e6023d-55b9-4970-84ed-c5b0aa6f20e1"
group1--src attr is : 
	src="/sale/common/FCKeditor/UserFilesImage/ade.jpg"
group2--url is : 
	/sale/common/FCKeditor/UserFilesImage/ade.jpg
group345--fckUrl or attUrl is : 
	common/FCKeditor/UserFilesImage/ade.jpg
group6--fileId is : 
	null

group0--img tag is : 
<img alt="" src="cid:297e022a3bb3d941013bb626e75d0047"
group1--src attr is : 
	src="/sale/cms/simpleDownload?fileId=297e022a3bb3d941013bb626e75d0047"
group2--url is : 
	/sale/cms/simpleDownload?fileId=297e022a3bb3d941013bb626e75d0047
group345--fckUrl or attUrl is : 
	cms/simpleDownload?fileId=297e022a3bb3d941013bb626e75d0047
group6--fileId is : 
	297e022a3bb3d941013bb626e75d0047

group0--img tag is : 
<img alt="" src="cid:8d69ff7b-a162-4103-9009-10306dc6c9bc"
group1--src attr is : 
	src="/sale/common/FCKeditor/editor/images/smiley/msn/regular_smile.gif"
group2--url is : 
	/sale/common/FCKeditor/editor/images/smiley/msn/regular_smile.gif
group345--fckUrl or attUrl is : 
	common/FCKeditor/editor/images/smiley/msn/regular_smile.gif
group6--fileId is : 
	null

group0--img tag is : 
<img style="WIDTH: 704px; HEIGHT: 379px" alt="acb" width="1075" height="593" src="cid:635c354e-80f3-4e58-b3eb-783ca7e97a8d"
group1--src attr is : 
	src="/sale/common/FCKeditor/UserFilesImage/a.jpg"
group2--url is : 
	/sale/common/FCKeditor/UserFilesImage/a.jpg
group345--fckUrl or attUrl is : 
	common/FCKeditor/UserFilesImage/a.jpg
group6--fileId is : 
	null

{297e022a3bb3d941013bb626e75d0047=true, fckImages=[common/FCKeditor/UserFilesImage/a.jpg, common/FCKeditor/UserFilesImage/ade.jpg, common/FCKeditor/editor/images/smiley/msn/regular_smile.gif, common/FCKeditor/UserFilesImage/a.jpg]}
[common/FCKeditor/UserFilesImage/a.jpg, common/FCKeditor/UserFilesImage/ade.jpg, common/FCKeditor/editor/images/smiley/msn/regular_smile.gif, common/FCKeditor/UserFilesImage/a.jpg]


后来觉得上面的正则全部写在一行太难读了,于是按分组把它拆成多个字符串再拼接起来。拆分后多了一个匹配图片文件名的分组(group5),附件id所在的分组相应地变成了group7:

package com.liyzh.regex;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.liyzh.file.ReadFromFile;

/**
 * Demo: scans an HTML fragment for {@code <img>} tags whose {@code src} points
 * either at an FCKeditor resource or at a {@code cms/simpleDownload} attachment,
 * and prints a {@code cid:} replacement for each one. The regular expression is
 * assembled from one named piece per capture group.
 *
 * <p>The rewritten tags are only printed; the HTML read from the file is not
 * modified.
 */
public class ReplaceImgTag {

    // The pieces below use [^>]*? / [^"]*? rather than a DOTALL ".*?" so that a
    // match can never start in one tag and end inside the src of a later one.
    // Negated character classes also match line terminators, so no (?s) flag is
    // needed. Known limitation: a literal '>' inside an attribute that precedes
    // src prevents the tag from matching.

    /** group 5: the image file name at the end of group 4, e.g. {@code a.jpg}. */
    private static final String GROUP5 = "([0-9a-zA-Z_]*\\.[0-9a-zA-Z]*)";

    /** group 4: an FCKeditor resource path ending in a file name. */
    private static final String GROUP4 = "(common/FCKeditor[^\"]*?/" + GROUP5 + ")";

    /** group 7: the attachment id inside group 6. */
    private static final String GROUP7 = "([0-9a-zA-Z]*)";

    /** group 6: an attachment download path. */
    private static final String GROUP6 = "(cms/simpleDownload\\?fileId=" + GROUP7 + ")";

    /** group 3: the src value without the leading context path (group 4 or group 6). */
    private static final String GROUP3 = "(" + GROUP4 + "|" + GROUP6 + ")";

    /** group 2: the src value, including any leading context path. */
    private static final String GROUP2 = "([^\"]*?" + GROUP3 + ")";

    /**
     * group 1: the src attribute. The pattern stops right after the URL and
     * does not consume the closing quote, so groups 0 and 1 end without it.
     */
    private static final String GROUP1 = "(src=\"" + GROUP2 + ")";

    /** group 0: the tag from {@code <img} through the end of group 1. */
    private static final Pattern IMG_SRC = Pattern.compile("<img[^>]*?" + GROUP1);

    /**
     * Reads {@code c:/regex.txt}, prints the capture groups of every matching
     * {@code <img>} tag together with a running match counter, then prints the
     * collected map and list.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String fileName = "c:/regex.txt";
        // Holds two kinds of entries: attachment id -> "true", and the single
        // key "fckImages" -> list of FCKeditor paths (added after the loop).
        Map<String, Object> cidMap = new HashMap<String, Object>();
        List<String> fckImages = new ArrayList<String>();
        String html = ReadFromFile.readFile(fileName).toString();

        Matcher matcher = IMG_SRC.matcher(html);
        int count = 0;
        while (matcher.find()) {
            String img = matcher.group(0);
            String src = matcher.group(1);
            String url = matcher.group(2);
            // Path relative to the web root, i.e. without the context path.
            String localUrl = matcher.group(3);
            String imgName = matcher.group(5);
            String fileId = matcher.group(7);

            if (fileId != null) {
                // Existing attachment: its file id is used as the content id.
                cidMap.put(fileId, "true");
                String newUrl = "cid:" + fileId;
                img = img.replace(url, newUrl);
            } else {
                // FCKeditor image: the whole tag is replaced by a minimal one
                // that carries only the freshly generated content id.
                String cid = UUID.randomUUID().toString();
                String newUrl = "cid:" + cid;
                img = "<img src=\"" + newUrl + "\" />";
                fckImages.add(localUrl);
            }

            System.out.println("\t\t\t\tMatch count is : " + count++);
            System.out.println("group0--img tag is : ");
            System.out.println(img);
            System.out.println("group1--src attr is : ");
            System.out.println("\t" + src);
            System.out.println("group2--url is : ");
            System.out.println("\t" + url);
            System.out.println("group346--fckUrl or attUrl is : ");
            System.out.println("\t" + localUrl);
            System.out.println("group5--imgName is : ");
            System.out.println("\t" + imgName);
            System.out.println("group7--fileId is : ");
            System.out.println("\t" + fileId);
            System.out.println();
        }

        cidMap.put("fckImages", fckImages);
        System.out.println(cidMap);
        System.out.println(fckImages);
    }
}