HTML网页解码类(JAVA实现)

来源:互联网 发布:iphoto for mac下载 编辑:程序博客网 时间:2024/06/17 14:06

本类的主要作用,是将经过HTML实体编码的字符(例如 &lt;、&#169;、&#xA9;)还原为其对应的原始字符。

尤其适用于各种特殊符号:这类编码均以 & 开头、以 ; 结尾。

package demo;

import java.util.HashMap;

/**
 * Static helpers that turn HTML character references back into the
 * characters they stand for, and that collapse HTML white space.
 *
 * <p>Three reference forms are recognised by {@link #decode(String)}:
 * named ({@code &name;}), decimal ({@code &#NNN;}) and hexadecimal
 * ({@code &#xHHH;} or {@code &#XHHH;}). A reference is only replaced when it
 * is terminated by a semicolon; anything else is copied through unchanged.
 */
public class HTMLDecoder {

    /**
     * Entity name (without the leading {@code &} and trailing {@code ;})
     * mapped to the character it denotes. Names are case-sensitive.
     *
     * <p>The map is public and mutable, so entries added or removed by
     * callers affect subsequent {@link #decode(String)} calls.
     */
    public static final HashMap<String, Character> charTable =
            new HashMap<String, Character>();

    /**
     * Replaces every well-formed character reference in {@code s} with the
     * character it denotes.
     *
     * <p>References that are unknown, unterminated, empty, or whose number
     * does not fit a valid Unicode code point are left exactly as written.
     * Numeric references above U+FFFF are emitted as a surrogate pair.
     *
     * @param s the text to decode; must not be {@code null}
     * @return the decoded text
     * @throws NullPointerException if {@code s} is {@code null}
     */
    public static String decode(String s) {
        int length = s.length();
        StringBuilder out = new StringBuilder(length);
        int pos = 0;
        while (pos < length) {
            char c = s.charAt(pos++);
            if (c == '&') {
                // pos now points just past the ampersand.
                int next = appendReference(s, pos, out);
                if (next >= 0) {
                    pos = next;
                    continue;
                }
            }
            out.append(c);
        }
        return out.toString();
    }

    /**
     * Tries to decode the reference whose body starts at {@code start}
     * (the index just after the {@code &}) and, on success, appends the
     * decoded character to {@code out}.
     *
     * @return the index just past the terminating {@code ;}, or -1 when no
     *         reference was decoded (nothing is appended in that case)
     */
    private static int appendReference(String s, int start, StringBuilder out) {
        if (start >= s.length()) {
            return -1;
        }
        char first = s.charAt(start);
        if (first == '#') {
            return appendNumericReference(s, start + 1, out);
        }
        if (isLetter(first)) {
            return appendNamedReference(s, start, out);
        }
        return -1;
    }

    /**
     * Decodes a decimal or hexadecimal reference; {@code start} is the index
     * just after the {@code #}.
     *
     * @return the index just past the terminating {@code ;}, or -1 on failure
     */
    private static int appendNumericReference(String s, int start, StringBuilder out) {
        int length = s.length();
        int radix = 10;
        int digitsStart = start;
        if (digitsStart < length) {
            char marker = s.charAt(digitsStart);
            if (marker == 'x' || marker == 'X') {
                radix = 16;
                digitsStart++;
            }
        }

        int end = digitsStart;
        while (end < length && isDigitOfRadix(s.charAt(end), radix)) {
            end++;
        }
        // Require at least one digit and a terminating semicolon.
        if (end == digitsStart || end >= length || s.charAt(end) != ';') {
            return -1;
        }

        try {
            int codePoint = Integer.parseInt(s.substring(digitsStart, end), radix);
            if (Character.isValidCodePoint(codePoint)) {
                out.appendCodePoint(codePoint);
                return end + 1;
            }
        } catch (NumberFormatException ignored) {
            // More digits than an int can hold: not a usable reference,
            // so the text is deliberately left as written.
        }
        return -1;
    }

    /**
     * Decodes a named reference; {@code start} is the index of the first
     * letter of the name.
     *
     * @return the index just past the terminating {@code ;}, or -1 when the
     *         name is unterminated or not present in {@link #charTable}
     */
    private static int appendNamedReference(String s, int start, StringBuilder out) {
        int length = s.length();
        int end = start + 1;
        while (end < length && isLetterOrDigit(s.charAt(end))) {
            end++;
        }
        if (end >= length || s.charAt(end) != ';') {
            return -1;
        }
        Character decoded = charTable.get(s.substring(start, end));
        if (decoded == null) {
            return -1;
        }
        out.append(decoded.charValue());
        return end + 1;
    }

    /** Returns whether {@code c} is a digit in the given radix (10 or 16). */
    private static boolean isDigitOfRadix(char c, int radix) {
        return radix == 16 ? isHexDigit(c) : isDigit(c);
    }

    /** Returns whether {@code c} is an ASCII letter or an ASCII digit. */
    private static boolean isLetterOrDigit(char c) {
        return isLetter(c) || isDigit(c);
    }

    /** Returns whether {@code c} is an ASCII hexadecimal digit. */
    private static boolean isHexDigit(char c) {
        return isHexLetter(c) || isDigit(c);
    }

    /** Returns whether {@code c} is in a-z or A-Z. */
    private static boolean isLetter(char c) {
        return ((c >= 'a') && (c <= 'z')) || ((c >= 'A') && (c <= 'Z'));
    }

    /** Returns whether {@code c} is in a-f or A-F. */
    private static boolean isHexLetter(char c) {
        return ((c >= 'a') && (c <= 'f')) || ((c >= 'A') && (c <= 'F'));
    }

    /** Returns whether {@code c} is in 0-9. */
    private static boolean isDigit(char c) {
        return (c >= '0') && (c <= '9');
    }

    /**
     * Collapses every run of HTML white space (see
     * {@link #isWhitespace(char)}) into a single space character. Leading
     * and trailing runs are collapsed too, not removed.
     *
     * @param s the text to compact; must not be {@code null}
     * @return the compacted text
     * @throws NullPointerException if {@code s} is {@code null}
     */
    public static String compact(String s) {
        int length = s.length();
        StringBuilder out = new StringBuilder(length);
        int pos = 0;
        while (pos < length) {
            char c = s.charAt(pos++);
            if (isWhitespace(c)) {
                // Swallow the rest of the run and emit one plain space.
                while (pos < length && isWhitespace(s.charAt(pos))) {
                    pos++;
                }
                c = ' ';
            }
            out.append(c);
        }
        return out.toString();
    }

    /**
     * Returns whether {@code ch} counts as white space in HTML.
     *
     * <p>HTML is very particular about what constitutes white space: only
     * space, carriage return, line feed, tab, form feed and the zero-width
     * space (U+200B) qualify. Other Unicode spaces, such as the no-break
     * space, do not.
     *
     * @param ch the character to test
     * @return {@code true} if {@code ch} is one of the six characters above
     */
    public static boolean isWhitespace(char ch) {
        return (ch == ' ') || (ch == '\r') || (ch == '\n')
                || (ch == '\t') || (ch == '\f') || (ch == '\u200b');
    }

    /** Registers one entity name with its numeric character code. */
    private static void define(String name, int code) {
        charTable.put(name, Character.valueOf((char) code));
    }

    static {
        // Markup-significant characters.
        define("quot", 34);
        define("amp", 38);
        define("apos", 39);
        define("lt", 60);
        define("gt", 62);

        // Latin-1 range (160-255).
        define("nbsp", 160);
        define("iexcl", 161);
        define("cent", 162);
        define("pound", 163);
        define("curren", 164);
        define("yen", 165);
        define("brvbar", 166);
        define("sect", 167);
        define("uml", 168);
        define("copy", 169);
        define("ordf", 170);
        define("laquo", 171);
        define("not", 172);
        define("shy", 173);
        define("reg", 174);
        define("macr", 175);
        define("deg", 176);
        define("plusmn", 177);
        define("sup2", 178);
        define("sup3", 179);
        define("acute", 180);
        define("micro", 181);
        define("para", 182);
        define("middot", 183);
        define("cedil", 184);
        define("sup1", 185);
        define("ordm", 186);
        define("raquo", 187);
        define("frac14", 188);
        define("frac12", 189);
        define("frac34", 190);
        define("iquest", 191);
        define("Agrave", 192);
        define("Aacute", 193);
        define("Acirc", 194);
        define("Atilde", 195);
        define("Auml", 196);
        define("Aring", 197);
        define("AElig", 198);
        define("Ccedil", 199);
        define("Egrave", 200);
        define("Eacute", 201);
        define("Ecirc", 202);
        define("Euml", 203);
        define("Igrave", 204);
        define("Iacute", 205);
        define("Icirc", 206);
        define("Iuml", 207);
        define("ETH", 208);
        define("Ntilde", 209);
        define("Ograve", 210);
        define("Oacute", 211);
        define("Ocirc", 212);
        define("Otilde", 213);
        define("Ouml", 214);
        define("times", 215);
        define("Oslash", 216);
        define("Ugrave", 217);
        define("Uacute", 218);
        define("Ucirc", 219);
        define("Uuml", 220);
        define("Yacute", 221);
        define("THORN", 222);
        define("szlig", 223);
        define("agrave", 224);
        define("aacute", 225);
        define("acirc", 226);
        define("atilde", 227);
        define("auml", 228);
        define("aring", 229);
        define("aelig", 230);
        define("ccedil", 231);
        define("egrave", 232);
        define("eacute", 233);
        define("ecirc", 234);
        define("euml", 235);
        define("igrave", 236);
        define("iacute", 237);
        define("icirc", 238);
        define("iuml", 239);
        define("eth", 240);
        define("ntilde", 241);
        define("ograve", 242);
        define("oacute", 243);
        define("ocirc", 244);
        define("otilde", 245);
        define("ouml", 246);
        define("divide", 247);
        define("oslash", 248);
        define("ugrave", 249);
        define("uacute", 250);
        define("ucirc", 251);
        define("uuml", 252);
        define("yacute", 253);
        define("thorn", 254);
        define("yuml", 255);

        // Extended Latin and spacing modifiers.
        define("OElig", 338);
        define("oelig", 339);
        define("Scaron", 352);
        define("scaron", 353);
        define("Yuml", 376);
        define("fnof", 402);
        define("circ", 710);
        define("tilde", 732);

        // Greek capitals.
        define("Alpha", 913);
        define("Beta", 914);
        define("Gamma", 915);
        define("Delta", 916);
        define("Epsilon", 917);
        define("Zeta", 918);
        define("Eta", 919);
        define("Theta", 920);
        define("Iota", 921);
        define("Kappa", 922);
        define("Lambda", 923);
        define("Mu", 924);
        define("Nu", 925);
        define("Xi", 926);
        define("Omicron", 927);
        define("Pi", 928);
        define("Rho", 929);
        define("Sigma", 931);
        define("Tau", 932);
        define("Upsilon", 933);
        define("Phi", 934);
        define("Chi", 935);
        define("Psi", 936);
        define("Omega", 937);

        // Greek small letters and variants.
        define("alpha", 945);
        define("beta", 946);
        define("gamma", 947);
        define("delta", 948);
        define("epsilon", 949);
        define("zeta", 950);
        define("eta", 951);
        define("theta", 952);
        define("iota", 953);
        define("kappa", 954);
        define("lambda", 955);
        define("mu", 956);
        define("nu", 957);
        define("xi", 958);
        define("omicron", 959);
        define("pi", 960);
        define("rho", 961);
        define("sigmaf", 962);
        define("sigma", 963);
        define("tau", 964);
        define("upsilon", 965);
        define("phi", 966);
        define("chi", 967);
        define("psi", 968);
        define("omega", 969);
        define("thetasym", 977);
        define("upsih", 978);
        define("piv", 982);

        // General punctuation and spacing.
        define("ensp", 8194);
        define("emsp", 8195);
        define("thinsp", 8201);
        define("zwnj", 8204);
        define("zwj", 8205);
        define("lrm", 8206);
        define("rlm", 8207);
        define("ndash", 8211);
        define("mdash", 8212);
        define("lsquo", 8216);
        define("rsquo", 8217);
        define("sbquo", 8218);
        define("ldquo", 8220);
        define("rdquo", 8221);
        define("bdquo", 8222);
        define("dagger", 8224);
        define("Dagger", 8225);
        define("bull", 8226);
        define("hellip", 8230);
        define("permil", 8240);
        define("prime", 8242);
        define("Prime", 8243);
        define("lsaquo", 8249);
        define("rsaquo", 8250);
        define("oline", 8254);
        define("frasl", 8260);
        define("euro", 8364);

        // Letter-like symbols.
        define("image", 8465);
        define("weierp", 8472);
        define("real", 8476);
        define("trade", 8482);
        define("alefsym", 8501);

        // Arrows.
        define("larr", 8592);
        define("uarr", 8593);
        define("rarr", 8594);
        define("darr", 8595);
        define("harr", 8596);
        define("crarr", 8629);
        define("lArr", 8656);
        define("uArr", 8657);
        define("rArr", 8658);
        define("dArr", 8659);
        define("hArr", 8660);

        // Mathematical operators.
        define("forall", 8704);
        define("part", 8706);
        define("exist", 8707);
        define("empty", 8709);
        define("nabla", 8711);
        define("isin", 8712);
        define("notin", 8713);
        define("ni", 8715);
        define("prod", 8719);
        define("sum", 8721);
        define("minus", 8722);
        define("lowast", 8727);
        define("radic", 8730);
        define("prop", 8733);
        define("infin", 8734);
        define("ang", 8736);
        define("and", 8743);
        define("or", 8744);
        define("cap", 8745);
        define("cup", 8746);
        define("int", 8747);
        define("there4", 8756);
        define("sim", 8764);
        define("cong", 8773);
        define("asymp", 8776);
        define("ne", 8800);
        define("equiv", 8801);
        define("le", 8804);
        define("ge", 8805);
        define("sub", 8834);
        define("sup", 8835);
        define("nsub", 8836);
        define("sube", 8838);
        define("supe", 8839);
        define("oplus", 8853);
        define("otimes", 8855);
        define("perp", 8869);
        define("sdot", 8901);

        // Technical and geometric symbols, card suits.
        define("lceil", 8968);
        define("rceil", 8969);
        define("lfloor", 8970);
        define("rfloor", 8971);
        define("lang", 9001);
        define("rang", 9002);
        define("loz", 9674);
        define("spades", 9824);
        define("clubs", 9827);
        define("hearts", 9829);
        define("diams", 9830);
    }

}

 

原创粉丝点击