HTML 乱码总结

来源:互联网 发布:ubuntu svn 库建立 编辑:程序博客网 时间:2024/05/20 13:09

  改写源码:


    package org.htmlparser.lexer;  
    import java.io.*;  
    import java.lang.reflect.InvocationTargetException;  
    import java.lang.reflect.Method;  
    import java.net.*;  
    import java.util.zip.*;  
    import org.htmlparser.http.ConnectionManager;  
    import org.htmlparser.util.ParserException;  
      
    // Referenced classes of package org.htmlparser.lexer:  
    //            InputStreamSource, PageIndex, StringSource, Cursor,   
    //            Stream, Source  
      
    public class Page  
        implements Serializable  
    {  
      
        /**
         * Creates a page over an empty string by delegating to the
         * single-string constructor.
         */
        public Page()
        {
            this("");
        }
      
        public Page(URLConnection connection)  
            throws ParserException  
        {  
            if(null == connection)  
            {  
                throw new IllegalArgumentException("connection cannot be null");  
            } else  
            {  
                setConnection(connection);  
                mBaseUrl = null;  
                return;  
            }  
        }  
      
        public Page(InputStream stream, String charset)  
            throws UnsupportedEncodingException  
        {  
            if(null == stream)  
                throw new IllegalArgumentException("stream cannot be null");  
            if(null == charset)  
                charset = "ISO-8859-1";  
            mSource = new InputStreamSource(stream, charset);  
            mIndex = new PageIndex(this);  
            mConnection = null;  
            mUrl = null;  
            mBaseUrl = null;  
        }  
      
        public Page(String text, String charset)  
        {  
            if(null == text)  
                throw new IllegalArgumentException("text cannot be null");  
            if(null == charset)  
                charset = "ISO-8859-1";  
            mSource = new StringSource(text, charset);  
            mIndex = new PageIndex(this);  
            mConnection = null;  
            mUrl = null;  
            mBaseUrl = null;  
        }  
      
        public Page(String text)  
        {  
            this(text, null);  
        }  
      
        public Page(Source source)  
        {  
            if(null == source)  
            {  
                throw new IllegalArgumentException("source cannot be null");  
            } else  
            {  
                mSource = source;  
                mIndex = new PageIndex(this);  
                mConnection = null;  
                mUrl = null;  
                mBaseUrl = null;  
                return;  
            }  
        }  
      
        /**
         * Returns the connection manager shared by all pages.
         *
         * @return the current static connection manager
         */
        public static ConnectionManager getConnectionManager()
        {
            return mConnectionManager;
        }
      
        /**
         * Replaces the connection manager shared by all pages.
         *
         * @param manager the new static connection manager
         */
        public static void setConnectionManager(ConnectionManager manager)
        {
            mConnectionManager = manager;
        }
      
        public String getCharset(String content)  
        {  
            String CHARSET_STRING = "charset";  
            String ret;  
            if(null == mSource)  
                ret = "ISO-8859-1";  
            else  
                ret = mSource.getEncoding();  
            if(null != content)  
            {  
                int index = content.indexOf("charset");  
                if(index != -1)  
                {  
                    content = content.substring(index + "charset".length()).trim();  
                    if(content.startsWith("="))  
                    {  
                        content = content.substring(1).trim();  
                        index = content.indexOf(";");  
                        if(index != -1)  
                            content = content.substring(0, index);  
                        if(content.startsWith("\"") && content.endsWith("\"") && 1 < content.length())  
                            content = content.substring(1, content.length() - 1);  
                        if(content.startsWith("'") && content.endsWith("'") && 1 < content.length())  
                            content = content.substring(1, content.length() - 1);  
                        ret = findCharset(content, ret);  
                    }  
                }  
            }  
            return ret;  
        }  
      
        public static String findCharset(String name, String fallback)  
        {  
            String ret;  
            try  
            {  
                Class cls = Class.forName("java.nio.charset.Charset");  
                Method method = cls.getMethod("forName", new Class[] {  
                    java.lang.String.class  
                });  
                Object object = method.invoke(null, new Object[] {  
                    name  
                });  
                method = cls.getMethod("name", new Class[0]);  
                object = method.invoke(object, new Object[0]);  
                ret = (String)object;  
            }  
            catch(ClassNotFoundException cnfe)  
            {  
                ret = name;  

            }  
            catch(NoSuchMethodException nsme)  
            {  
                ret = name;  
            }  
            catch(IllegalAccessException ia)  
            {  
                ret = name;  
            }  
            catch(InvocationTargetException ita)  
            {  
                ret = fallback;  
                System.out.println("unable to determine cannonical charset name for " + name + " - using " + fallback);  
            }  
            return ret;  
        }  
      
        private void writeObject(ObjectOutputStream out)  
            throws IOException  
        {  
            if(null != getConnection())  
            {  
                out.writeBoolean(true);  
                out.writeInt(mSource.offset());  
                String href = getUrl();  
                out.writeObject(href);  
                setUrl(getConnection().getURL().toExternalForm());  
                Source source = getSource();  
                mSource = null;  
                PageIndex index = mIndex;  
                mIndex = null;  
                out.defaultWriteObject();  
                mSource = source;  
                mIndex = index;  
            } else  
            {  
                out.writeBoolean(false);  
                String href = getUrl();  
                out.writeObject(href);  
                setUrl(null);  
                out.defaultWriteObject();  
                setUrl(href);  
            }  
        }  
      
        private void readObject(ObjectInputStream in)  
            throws IOException, ClassNotFoundException  
        {  
            boolean fromurl = in.readBoolean();  
            if(fromurl)  
            {  
                int offset = in.readInt();  
                String href = (String)in.readObject();  
                in.defaultReadObject();  
                if(null != getUrl())  
                {  
                    URL url = new URL(getUrl());  
                    try  
                    {  
                        setConnection(url.openConnection());  
                    }  
                    catch(ParserException pe)  
                    {  
                        throw new IOException(pe.getMessage());  
                    }  
                }  
                Cursor cursor = new Cursor(this, 0);  
                for(int i = 0; i < offset; i++)  
                    try  
                    {  
                        getCharacter(cursor);  
                    }  
                    catch(ParserException pe)  
                    {  
                        throw new IOException(pe.getMessage());  
                    }  
      
                setUrl(href);  
            } else  
            {  
                String href = (String)in.readObject();  
                in.defaultReadObject();  
                setUrl(href);  
            }  
        }  
      
        public void reset()  
        {  
            getSource().reset();  
            mIndex = new PageIndex(this);  
        }  
      
        public void close()  
            throws IOException  
        {  
            if(null != getSource())  
                getSource().destroy();  
        }  
      
        /**
         * Calls {@link #close} when the page is finalized.
         *
         * @throws Throwable whatever <code>close()</code> throws
         */
        protected void finalize()
            throws Throwable
        {
            close();
        }
      
        public URLConnection getConnection()  
        {  
            return mConnection;  
        }  
      
        public void setConnection(URLConnection connection)  
            throws ParserException  
        {  
            mConnection = connection;  
            mConnection.setConnectTimeout(6000);  
            mConnection.setReadTimeout(6000);  
            try  
            {  
                getConnection().connect();  
            }  
            catch(UnknownHostException uhe)  
            {  
                throw new ParserException("Connect to " + mConnection.getURL().toExternalForm() + " failed.", uhe);  
            }  
            catch(IOException ioe)  
            {  
                throw new ParserException("Exception connecting to " + mConnection.getURL().toExternalForm() + " (" + ioe.getMessage() + ").", ioe);  
            }  
            String type = getContentType();  
            String charset = getCharset(type);  
            try  
            {  
                String contentEncoding = connection.getContentEncoding();  
                System.out.println("contentEncoding="+contentEncoding);  
                Stream stream;  
                if(null != contentEncoding && -1 != contentEncoding.indexOf("gzip"))  
                    stream = new Stream(new GZIPInputStream(getConnection().getInputStream()));  
                else  
                if(null != contentEncoding && -1 != contentEncoding.indexOf("deflate"))  
                    stream = new Stream(new InflaterInputStream(getConnection().getInputStream(), new Inflater(true)));  
                else{  
                    stream = new Stream(getConnection().getInputStream());  
                }  
      
                try  
                {  
                    /*
                     * 时间:2010年8月6日
                     * 原因:当String charset = getCharset(type);返回来的是ISO-8859-1的时候,需要处理一下
                     */  
                    if(charset.indexOf("ISO-8859-1")!=-1){  
                          
                        charset =getGaoBinDEFAULT_CHARSET() ;  
                          
                    }  
            mSource = new InputStreamSource(stream, charset);  
                }  
                catch(UnsupportedEncodingException uee)  
                {  
                    charset = "ISO-8859-1";  
                    mSource = new InputStreamSource(stream, charset);  
                }  
            }  
            catch(IOException ioe)  
            {  
                throw new ParserException("Exception getting input stream from " + mConnection.getURL().toExternalForm() + " (" + ioe.getMessage() + ").", ioe);  
            }  
            mUrl = connection.getURL().toExternalForm();  
            mIndex = new PageIndex(this);  
        }  
      
        public String getUrl()  
        {  
            return mUrl;  
        }  
      
        public void setUrl(String url)  
        {  
            mUrl = url;  
        }  
      
        public String getBaseUrl()  
        {  
            return mBaseUrl;  
        }  
      
        public void setBaseUrl(String url)  
        {  
            mBaseUrl = url;  
        }  
      
        public Source getSource()  
        {  
            return mSource;  
        }  
      
        public String getContentType()  
        {  
            String ret = "text/html";  
            URLConnection connection = getConnection();  
            if(null != connection)  
            {  
                String content = connection.getHeaderField("Content-Type");  
                if(null != content)  
                    ret = content;  
            }  
            return ret;  
        }  
      
        /**
         * Returns the character at the cursor and advances the cursor.
         * <p>
         * If the cursor is at the source's current offset a new character is
         * read from the source; if it is behind the offset the character is
         * fetched from what the source has already read. A carriage return is
         * returned as a line feed, and a line feed immediately following it
         * is skipped, so "\r\n" yields a single '\n'. Every returned '\n' is
         * registered with the line index.
         *
         * @param cursor the position to read from; advanced past the
         * characters consumed
         * @return the character read, or '\uFFFF' when the source has no more
         * characters (the cursor is not advanced in that case)
         * @throws ParserException if the cursor is beyond the source offset,
         * or if reading or unreading from the source fails
         */
        public char getCharacter(Cursor cursor)
            throws ParserException
        {
            int i = cursor.getPosition();
            int offset = mSource.offset();
            char ret;
            // Cursor is at the read frontier: pull a new character.
            if(offset == i)
                try
                {
                    i = mSource.read();
                    if(-1 == i)
                    {
                        // end of input
                        ret = '\uFFFF';
                    } else
                    {
                        ret = (char)i;
                        cursor.advance();
                    }
                }
                catch(IOException ioe)
                {
                    throw new ParserException("problem reading a character at position " + cursor.getPosition(), ioe);
                }
            else
            // Cursor is behind the frontier: fetch an already-read character.
            if(offset > i)
            {
                try
                {
                    ret = mSource.getCharacter(i);
                }
                catch(IOException ioe)
                {
                    throw new ParserException("can't read a character at position " + i, ioe);
                }
                cursor.advance();
            } else
            {
                // Cursor is ahead of anything read so far.
                throw new ParserException("attempt to read future characters from source " + i + " > " + mSource.offset());
            }
            if('\r' == ret)
            {
                // Report CR as LF, then swallow a directly following LF.
                ret = '\n';
                if(mSource.offset() == cursor.getPosition())
                    // The following character has not been read yet: read it
                    // and push it back unless it is the LF of a CR LF pair.
                    try
                    {
                        i = mSource.read();
                        if(-1 != i)
                            if('\n' == (char)i)
                                cursor.advance();
                            else
                                try
                                {
                                    mSource.unread();
                                }
                                catch(IOException ioe)
                                {
                                    throw new ParserException("can't unread a character at position " + cursor.getPosition(), ioe);
                                }
                    }
                    catch(IOException ioe)
                    {
                        throw new ParserException("problem reading a character at position " + cursor.getPosition(), ioe);
                    }
                else
                    // The following character was already read: just inspect it.
                    try
                    {
                        if('\n' == mSource.getCharacter(cursor.getPosition()))
                            cursor.advance();
                    }
                    catch(IOException ioe)
                    {
                        throw new ParserException("can't read a character at position " + cursor.getPosition(), ioe);
                    }
            }
            // Record the line break at the (already advanced) cursor.
            if('\n' == ret)
                mIndex.add(cursor);
            return ret;
        }
      
        public void ungetCharacter(Cursor cursor)  
            throws ParserException  
        {  
            cursor.retreat();  
            int i = cursor.getPosition();  
            try  
            {  
                char ch = mSource.getCharacter(i);  
                if('\n' == ch && 0 != i)  
                {  
                    ch = mSource.getCharacter(i - 1);  
                    if('\r' == ch)  
                        cursor.retreat();  
                }  
            }  
            catch(IOException ioe)  
            {  
                throw new ParserException("can't read a character at position " + cursor.getPosition(), ioe);  
            }  
        }  
      
        public String getEncoding()  
        {  
            return getSource().getEncoding();  
        }  
      
        public void setEncoding(String character_set)  
            throws ParserException  
        {  
            Page.GaoBinDEFAULT_CHARSET = character_set;  
            getSource().setEncoding(character_set);  
        }  
      
        public URL constructUrl(String link, String base)  
            throws MalformedURLException  
        {  
            return constructUrl(link, base, false);  
        }  
      
        public URL constructUrl(String link, String base, boolean strict)  
            throws MalformedURLException  
        {  
            int index;  
            URL url;  
            if(!strict && '?' == link.charAt(0))  
            {  
                if(-1 != (index = base.lastIndexOf('?')))  
                    base = base.substring(0, index);  
                url = new URL(base + link);  
            } else  
            {  
                url = new URL(new URL(base), link);  
            }  
            String path = url.getFile();  
            boolean modified = false;  
            boolean absolute = link.startsWith("/");  
            if(!absolute)  
                do  
                {  
                    if(!path.startsWith("/."))  
                        break;  
                    if(path.startsWith("/../"))  
                    {  
                        path = path.substring(3);  
                        modified = true;  
                        continue;  
                    }  
                    if(!path.startsWith("/./") && !path.startsWith("/."))  
                        break;  
                    path = path.substring(2);  
                    modified = true;  
                } while(true);  
            while(-1 != (index = path.indexOf("/\\")))   
            {  
                path = path.substring(0, index + 1) + path.substring(index + 2);  
                modified = true;  
            }  
            if(modified)  
                url = new URL(url, path);  
            return url;  
        }  
      
        public String getAbsoluteURL(String link)  
        {  
            return getAbsoluteURL(link, false);  
        }  
      
        public String getAbsoluteURL(String link, boolean strict)  
        {  
            String ret;  
            if(null == link || "".equals(link))  
                ret = "";  
            else  
                try  
                {  
                    String base = getBaseUrl();  
                    if(null == base)  
                        base = getUrl();  
                    if(null == base)  
                    {  
                        ret = link;  
                    } else  
                    {  
                        URL url = constructUrl(link, base, strict);  
                        ret = url.toExternalForm();  
                    }  
                }  
                catch(MalformedURLException murle)  
                {  
                    ret = link;  
                }  
            return ret;  
        }  
      
        public int row(Cursor cursor)  
        {  
            return mIndex.row(cursor);  
        }  
      
        public int row(int position)  
        {  
            return mIndex.row(position);  
        }  
      
        public int column(Cursor cursor)  
        {  
            return mIndex.column(cursor);  
        }  
      
        public int column(int position)  
        {  
            return mIndex.column(position);  
        }  
      
        public String getText(int start, int end)  
            throws IllegalArgumentException  
        {  
            String ret;  
            try  
            {  
                ret = mSource.getString(start, end - start);  
            }  
            catch(IOException ioe)  
            {  
                throw new IllegalArgumentException("can't get the " + (end - start) + "characters at position " + start + " - " + ioe.getMessage());  
            }  
            return ret;  
        }  
      
        public void getText(StringBuffer buffer, int start, int end)  
            throws IllegalArgumentException  
        {  
            if(mSource.offset() < start || mSource.offset() < end)  
                throw new IllegalArgumentException("attempt to extract future characters from source" + start + "|" + end + " > " + mSource.offset());  
            int length;  
            if(end < start)  
            {  
                length = end;  
                end = start;  
                start = length;  
            }  
            length = end - start;  
            try  
            {  
                mSource.getCharacters(buffer, start, length);  
            }  
            catch(IOException ioe)  
            {  
                throw new IllegalArgumentException("can't get the " + (end - start) + "characters at position " + start + " - " + ioe.getMessage());  
            }  
        }  
      
        public String getText()  
        {  
            return getText(0, mSource.offset());  
        }  
      
        public void getText(StringBuffer buffer)  
        {  
            getText(buffer, 0, mSource.offset());  
        }  
      
        public void getText(char array[], int offset, int start, int end)  
            throws IllegalArgumentException  
        {  
            if(mSource.offset() < start || mSource.offset() < end)  
                throw new IllegalArgumentException("attempt to extract future characters from source");  
            int length;  
            if(end < start)  
            {  
                length = end;  
                end = start;  
                start = length;  
            }  
            length = end - start;  
            try  
            {  
                mSource.getCharacters(array, offset, start, end);  
            }  
            catch(IOException ioe)  
            {  
                throw new IllegalArgumentException("can't get the " + (end - start) + "characters at position " + start + " - " + ioe.getMessage());  
            }  
        }  
      
        public String getLine(Cursor cursor)  
        {  
            int line = row(cursor);  
            int size = mIndex.size();  
            int start;  
            int end;  
            if(line < size)  
            {  
                start = mIndex.elementAt(line);  
                if(++line <= size)  
                    end = mIndex.elementAt(line);  
                else  
                    end = mSource.offset();  
            } else  
            {  
                start = mIndex.elementAt(line - 1);  
                end = mSource.offset();  
            }  
            return getText(start, end);  
        }  
      
        public String getLine(int position)  
        {  
            return getLine(new Cursor(this, position));  
        }  
      
        public String toString()  
        {  
            String ret;  
            if(mSource.offset() > 0)  
            {  
                StringBuffer buffer = new StringBuffer(43);  
                int start = mSource.offset() - 40;  
                if(0 > start)  
                    start = 0;  
                else  
                    buffer.append("...");  
                getText(buffer, start, mSource.offset());  
                ret = buffer.toString();  
            } else  
            {  
                ret = super.toString();  
            }  
            return ret;  
        }  
      
        /** Name of the default character set. */
        public static final String DEFAULT_CHARSET = "ISO-8859-1";
        /**
         * Character set that setConnection() substitutes when the detected
         * character set is ISO-8859-1; written by setEncoding() and
         * setGaoBinDEFAULT_CHARSET(). Not initialized here, so it is null
         * until one of them is called.
         */
        public static String GaoBinDEFAULT_CHARSET;
        /** Name of the default content type. */
        public static final String DEFAULT_CONTENT_TYPE = "text/html";
        /** End-of-input marker; 65535 is the same value as '\uFFFF'. */
        public static final char EOF = 65535;
        /** URL recorded for this page; may be null. */
        protected String mUrl;
        /** Base URL used when resolving links; may be null. */
        protected String mBaseUrl;
        /** Character source backing this page. */
        protected Source mSource;
        /** Line index used for the row and column lookups. */
        protected PageIndex mIndex;
        /** Connection the page was read from; transient, so not serialized. */
        protected transient URLConnection mConnection;
        /** Connection manager shared by all pages. */
        protected static ConnectionManager mConnectionManager = new ConnectionManager();
          
        /**
         * Returns the static override character set.
         *
         * @return the current value of <code>GaoBinDEFAULT_CHARSET</code>;
         * null if it has never been assigned
         */
        public static String getGaoBinDEFAULT_CHARSET() {
            return GaoBinDEFAULT_CHARSET;
        }
      
        /**
         * Sets the static override character set.
         *
         * @param gaoBinDEFAULT_CHARSET the new value of
         * <code>GaoBinDEFAULT_CHARSET</code>
         */
        public static void setGaoBinDEFAULT_CHARSET(String gaoBinDEFAULT_CHARSET) {
            GaoBinDEFAULT_CHARSET = gaoBinDEFAULT_CHARSET;
        }
      
    } 

0 0
原创粉丝点击