BDB Wrapper

The listing below is Heritrix's org.archive.crawler.frontier.BdbMultipleWorkQueues, the BerkeleyDB JE wrapper that stores all pending CrawlURIs in a single database; because entries are read from per-classKey (per-host) starting points, that one database behaves as a collection of independent work queues.

/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual
 *  contributors.
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.crawler.frontier;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.math.BigInteger;
import java.util.ArrayList;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;

import javax.management.openmbean.CompositeData;
import javax.management.openmbean.CompositeDataSupport;
import javax.management.openmbean.OpenDataException;

import org.apache.commons.collections.Closure;
import org.archive.bdb.KryoBinding;
import org.archive.modules.CrawlURI;
import org.archive.util.ArchiveUtils;

import com.google.common.base.Charsets;
import com.sleepycat.bind.EntryBinding;
import com.sleepycat.bind.serial.StoredClassCatalog;
import com.sleepycat.je.Cursor;
import com.sleepycat.je.Database;
import com.sleepycat.je.DatabaseEntry;
import com.sleepycat.je.DatabaseException;
import com.sleepycat.je.OperationStatus;
import com.sleepycat.util.RuntimeExceptionWrapper;

/**
 * A BerkeleyDB-database-backed structure for holding ordered
 * groupings of CrawlURIs. Reading the groupings from specific
 * per-grouping (per-classKey/per-Host) starting points allows
 * this to act as a collection of independent queues.
 *
 * <p>For how the bdb keys are made, see {@link #calculateInsertKey(CrawlURI)}.
 *
 * <p>TODO: refactor, improve naming.
 *
 * @author gojomo
 */
public class BdbMultipleWorkQueues {
    private static final long serialVersionUID = 1L;

    private static final Logger LOGGER =
        Logger.getLogger(BdbMultipleWorkQueues.class.getName());

    /** Database holding all pending URIs, grouped in virtual queues */
    private Database pendingUrisDB = null;

    /** Supporting bdb serialization of CrawlURIs */
    private EntryBinding<CrawlURI> crawlUriBinding;

    /**
     * Create the multi queue backed by the given database.
     *
     * @param db bdb database to use
     * @param classCatalog Class catalog to use.
     * @throws DatabaseException
     */
    public BdbMultipleWorkQueues(Database db,
        StoredClassCatalog classCatalog)
    throws DatabaseException {
        this.pendingUrisDB = db;
        crawlUriBinding =
            new KryoBinding<CrawlURI>(CrawlURI.class);
//            new RecyclingSerialBinding<CrawlURI>(classCatalog, CrawlURI.class);
//            new BenchmarkingBinding<CrawlURI>(new EntryBinding[] {
//                new KryoBinding<CrawlURI>(CrawlURI.class,true),
//                new KryoBinding<CrawlURI>(CrawlURI.class,false),
//                new RecyclingSerialBinding<CrawlURI>(classCatalog, CrawlURI.class),
//            });
    }

    /**
     * Delete all CrawlURIs matching the given expression.
     *
     * A JE record has two parts, a key and a value, and each part is
     * wrapped in a DatabaseEntry object. To work with a record you must
     * therefore create two DatabaseEntry instances, one acting as the key
     * and the other as the value. (A minimal standalone example of this
     * pattern follows the listing.)
     *
     * @param match regular expression that URIs to be deleted must match
     * @param queue classKey of the queue being scanned
     * @param headKey key of the queue head at which to start the scan
     * @return count of deleted items
     * @throws DatabaseException
     */
    public long deleteMatchingFromQueue(String match, String queue,
            DatabaseEntry headKey) throws DatabaseException {
        long deletedCount = 0; // running count of deleted entries
        Pattern pattern = Pattern.compile(match); // compile the match expression
        DatabaseEntry key = headKey; // start searching from the head key
        DatabaseEntry value = new DatabaseEntry(); // receives the data found by the search
        Cursor cursor = null; // cursor; must always be closed after use
        try {
            cursor = pendingUrisDB.openCursor(null, null); // open the cursor
            OperationStatus result = cursor.getSearchKeyRange(headKey,
                    value, null); // walk every record from the first matching key onward
            while (result == OperationStatus.SUCCESS) { // a record was found
                if (value.getData().length > 0) { // record data is a byte[]; convert it back to a (serializable) Java object
                    CrawlURI curi = (CrawlURI) crawlUriBinding
                            .entryToObject(value);
                    if (!curi.getClassKey().equals(queue)) { // each queue has its own classKey; if the record belongs to another queue, stop
                        // rolled into next queue; finished with this queue
                        break;
                    }
                    if (pattern.matcher(curi.toString()).matches()) { // does the URI match the regex?
                        cursor.delete(); // delete the matching record
                        deletedCount++; // bump the deleted count
                    }
                }
                result = cursor.getNext(key, value, null);
            }
        } finally {
            if (cursor != null) {
                cursor.close();
            }
        }
        return deletedCount;
    }

    /**
     * @param m marker or null to start with first entry
     * @param maxMatches maximum number of matches to return
     * @param pattern regular expression that URIs must match
     * @param verbose if true, include the classKey and a short report line per URI
     * @return list of matches starting from marker position
     * @throws DatabaseException
     */
    public CompositeData getFrom(
            String m,
            int maxMatches,
            Pattern pattern,
            boolean verbose)
    throws DatabaseException {
        int matches = 0;
        int tries = 0;
        ArrayList<String> results = new ArrayList<String>(maxMatches);

        DatabaseEntry key;
        if (m == null) {
            key = getFirstKey();
        } else {
            byte[] marker = m.getBytes(); // = FrontierJMXTypes.fromString(m);
            key = new DatabaseEntry(marker);
        }
        DatabaseEntry value = new DatabaseEntry();

        Cursor cursor = null;
        OperationStatus result = null;
        try {
            cursor = pendingUrisDB.openCursor(null, null);
            result = cursor.getSearchKey(key, value, null);

            while (matches < maxMatches && result == OperationStatus.SUCCESS) {
                if (value.getData().length > 0) {
                    CrawlURI curi = (CrawlURI) crawlUriBinding.entryToObject(value);
                    if (pattern.matcher(curi.toString()).matches()) {
                        if (verbose) {
                            results.add("[" + curi.getClassKey() + "] "
                                    + curi.shortReportLine());
                        } else {
                            results.add(curi.toString());
                        }
                        matches++;
                    }
                    tries++;
                }
                result = cursor.getNext(key, value, null);
            }
        } finally {
            if (cursor != null) {
                cursor.close();
            }
        }

        if (result != OperationStatus.SUCCESS) {
            // end of scan
            m = null;
        } else {
            m = new String(key.getData()); // = FrontierJMXTypes.toString(key.getData());
        }

        String[] arr = results.toArray(new String[results.size()]);
        CompositeData cd;
        try {
            cd = new CompositeDataSupport(
                    /*FrontierJMXTypes.URI_LIST_DATA*/ null,
                    new String[] { "list", "marker" },
                    new Object[] { arr, m });
        } catch (OpenDataException e) {
            throw new IllegalStateException(e);
        }
        return cd;
    }

    /**
     * @return the key to the first item in the database
     * @throws DatabaseException
     */
    protected DatabaseEntry getFirstKey() throws DatabaseException {
        DatabaseEntry key = new DatabaseEntry();
        DatabaseEntry value = new DatabaseEntry();
        Cursor cursor = pendingUrisDB.openCursor(null, null);
        OperationStatus status = cursor.getNext(key, value, null);
        cursor.close();
        if (status == OperationStatus.SUCCESS) {
            return key;
        }
        return null;
    }

    /**
     * Get the next nearest item after the given key. Relies on
     * external discipline -- we'll look at the queue's count of how many
     * items it has -- to avoid asking for something from a
     * range where there are no associated items --
     * otherwise could get first item of next 'queue' by mistake.
     *
     * <p>TODO: hold within a queue's range
     *
     * @param headKey Key prefix that demarks the beginning of the range
     * in <code>pendingUrisDB</code> we're interested in.
     * @return CrawlURI.
     * @throws DatabaseException
     */
    public CrawlURI get(DatabaseEntry headKey)
    throws DatabaseException {
        DatabaseEntry result = new DatabaseEntry();

        // From Linda Lee of sleepycat:
        // "You want to check the status returned from Cursor.getSearchKeyRange
        // to make sure that you have OperationStatus.SUCCESS. In that case,
        // you have found a valid data record, and result.getData()
        // (called by internally by the binding code, in this case) will be
        // non-null. The other possible status return is
        // OperationStatus.NOTFOUND, in which case no data record matched
        // the criteria. "
        OperationStatus status = getNextNearestItem(headKey, result);
        CrawlURI retVal = null;
        if (status != OperationStatus.SUCCESS) {
            LOGGER.severe("See '1219854 NPE je-2.0 "
                    + "entryToObject...'. OperationStatus "
                    + " was not SUCCESS: "
                    + status
                    + ", headKey "
                    + BdbWorkQueue.getPrefixClassKey(headKey.getData()));
            return null;
        }
        try {
            retVal = (CrawlURI) crawlUriBinding.entryToObject(result);
        } catch (ClassCastException cce) {
            Object obj = crawlUriBinding.entryToObject(result);
            LOGGER.log(Level.SEVERE,
                    "see [#HER-1283]: deserialized " + obj.getClass()
                    + " has ClassLoader "
                    + obj.getClass().getClassLoader().getClass(),
                    cce);
            return null;
        } catch (RuntimeExceptionWrapper rw) {
            LOGGER.log(
                Level.SEVERE,
                "expected object missing in queue " +
                BdbWorkQueue.getPrefixClassKey(headKey.getData()),
                rw);
            return null;
        }
        retVal.setHolderKey(headKey);
        return retVal;
    }

    protected OperationStatus getNextNearestItem(DatabaseEntry headKey,
            DatabaseEntry result) throws DatabaseException {
        Cursor cursor = null;
        OperationStatus status;
        try {
            cursor = this.pendingUrisDB.openCursor(null, null);

            // get cap; headKey at this point should always point to
            // a queue-beginning cap entry (zero-length value)
            status = cursor.getSearchKey(headKey, result, null);
            if (status != OperationStatus.SUCCESS) {
                LOGGER.severe("bdb queue cap missing: "
                        + status.toString() + " " + new String(headKey.getData()));
                return status;
            }
            if (result.getData().length > 0) {
                LOGGER.severe("bdb queue has nonzero size: "
                        + result.getData().length);
                return OperationStatus.KEYEXIST;
            }
            // get next item (real first item of queue)
            status = cursor.getNext(headKey, result, null);
        } finally {
            if (cursor != null) {
                cursor.close();
            }
        }
        return status;
    }

    /**
     * Put the given CrawlURI in at the appropriate place.
     *
     * @param curi
     * @throws DatabaseException
     */
    public void put(CrawlURI curi, boolean overwriteIfPresent)
    throws DatabaseException {
        DatabaseEntry insertKey = (DatabaseEntry) curi.getHolderKey();
        if (insertKey == null) {
            insertKey = calculateInsertKey(curi);
            curi.setHolderKey(insertKey);
        }
        DatabaseEntry value = new DatabaseEntry();
        crawlUriBinding.objectToEntry(curi, value);
        // Output tally on avg. size if level is FINE or greater.
        if (LOGGER.isLoggable(Level.FINE)) {
            tallyAverageEntrySize(curi, value);
        }
        OperationStatus status;
        if (overwriteIfPresent) {
            status = pendingUrisDB.put(null, insertKey, value);
        } else {
            status = pendingUrisDB.putNoOverwrite(null, insertKey, value);
        }

        if (status != OperationStatus.SUCCESS) {
            LOGGER.log(Level.SEVERE, "URI enqueueing failed; " + status + " " + curi, new RuntimeException());
        }
    }

    private long entryCount = 0;
    private long entrySizeSum = 0;
    private int largestEntry = 0;

    /**
     * Log average size of database entry.
     * @param curi CrawlURI this entry is for.
     * @param value Database entry value.
     */
    private synchronized void tallyAverageEntrySize(CrawlURI curi,
            DatabaseEntry value) {
        entryCount++;
        int length = value.getData().length;
        entrySizeSum += length;
        int avg = (int) (entrySizeSum / entryCount);
        if (entryCount % 1000 == 0) {
            LOGGER.fine("Average entry size at " + entryCount + ": " + avg);
        }
        if (length > largestEntry) {
            largestEntry = length;
            LOGGER.fine("Largest entry: " + length + " " + curi);
            if (length > (2 * avg)) {
                LOGGER.fine("excessive?");
            }
        }
    }

    /**
     * Calculate the 'origin' key for a virtual queue of items
     * with the given classKey. This origin key will be a
     * prefix of the keys for all items in the queue.
     *
     * @param classKey String key to derive origin byte key from
     * @return a byte array key
     */
    static byte[] calculateOriginKey(String classKey) {
        byte[] classKeyBytes = null;
        int len = 0;
        try {
            classKeyBytes = classKey.getBytes("UTF-8");
            len = classKeyBytes.length;
        } catch (UnsupportedEncodingException e) {
            // should be impossible; all JVMs must support UTF-8
            e.printStackTrace();
        }
        byte[] keyData = new byte[len + 1];
        System.arraycopy(classKeyBytes, 0, keyData, 0, len);
        keyData[len] = 0;
        return keyData;
    }

    /**
     * Calculate the insertKey that places a CrawlURI in the
     * desired spot. First bytes are always classKey (usu. host)
     * based -- ensuring grouping by host -- terminated by a zero
     * byte. Then 8 bytes of data ensuring desired ordering
     * within that 'queue' are used. The first byte of these 8 is
     * priority -- allowing 'immediate' and 'soon' items to
     * sort above regular. Next 1 byte is 'precedence'. Last 6 bytes
     * are ordinal serial number, ensuring earlier-discovered
     * URIs sort before later.
     *
     * NOTE: Dangers here are:
     * (1) priorities or precedences over 2^7 (signed byte comparison)
     * (2) ordinals over 2^48
     *
     * Package access & static for testing purposes.
     *
     * @param curi
     * @return a DatabaseEntry key for the CrawlURI
     */
    static DatabaseEntry calculateInsertKey(CrawlURI curi) {
        byte[] classKeyBytes = null;
        int len = 0;
        classKeyBytes = curi.getClassKey().getBytes(Charsets.UTF_8);
        len = classKeyBytes.length;
        byte[] keyData = new byte[len + 9];
        System.arraycopy(classKeyBytes, 0, keyData, 0, len);
        keyData[len] = 0;
        long ordinalPlus = curi.getOrdinal() & 0x0000FFFFFFFFFFFFL;
        ordinalPlus =
            ((long) curi.getSchedulingDirective() << 56) | ordinalPlus;
        long precedence = Math.min(curi.getPrecedence(), 127);
        ordinalPlus =
            (((precedence) & 0xFFL) << 48) | ordinalPlus;
        ArchiveUtils.longIntoByteArray(ordinalPlus, keyData, len + 1);
        return new DatabaseEntry(keyData);
    }

    static String insertKeyToString(DatabaseEntry holderKey) {
        StringBuilder result = new StringBuilder();
        byte[] data = holderKey.getData();
        int p = findFirstZero(data);
        result.append(new String(data, 0, p));

        java.io.ByteArrayInputStream binp =
            new java.io.ByteArrayInputStream(data, p + 1, data.length);
        java.io.DataInputStream dinp = new java.io.DataInputStream(binp);
        long l = 0;
        try {
            l = dinp.readLong();
        } catch (IOException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
        result.append(" blah=").append(l);

        return result.toString();
    }

    private static int findFirstZero(byte[] b) {
        for (int i = 0; i < b.length; i++) {
            if (b[i] == 0) {
                return i;
            }
        }
        return -1;
    }

    /**
     * Delete the given CrawlURI from persistent store. Requires
     * the key under which it was stored be available.
     *
     * @param item
     * @throws DatabaseException
     */
    public void delete(CrawlURI item) throws DatabaseException {
        OperationStatus status;
        DatabaseEntry de = (DatabaseEntry) item.getHolderKey();
        status = pendingUrisDB.delete(null, de);
        if (status != OperationStatus.SUCCESS) {
            LOGGER.severe("expected item not present: "
                    + item
                    + "("
                    + (new BigInteger(((DatabaseEntry) item.getHolderKey())
                            .getData())).toString(16) + ")");
        }
    }

    /**
     * Method used by BdbFrontier during checkpointing.
     * <p>The backing bdbje database has been marked deferred write so we save
     * on writes to disk.  Means no guarantees disk will have what's in memory
     * unless a sync is called (Calling sync on the bdbje Environment is not
     * sufficient).
     * <p>Package access only because only Frontiers of this package would ever
     * need access.
     * @see <a href="http://www.sleepycat.com/jedocs/GettingStartedGuide/DB.html">Deferred Write Databases</a>
     */
    void sync() {
        if (this.pendingUrisDB == null) {
            return;
        }
        try {
            this.pendingUrisDB.sync();
        } catch (DatabaseException e) {
            e.printStackTrace();
        }
    }

    /**
     * clean up
     */
    public void close() {
/*
        try {
            this.pendingUrisDB.close();
        } catch (DatabaseException e) {
            e.printStackTrace();
        }
*/
    }

    /**
     * Add a dummy 'cap' entry at the given insertion key. Prevents
     * 'seeks' to queue heads from holding lock on last item of
     * 'preceding' queue. See:
     * http://sourceforge.net/tracker/index.php?func=detail&aid=1262665&group_id=73833&atid=539102
     *
     * @param origin key at which to insert the cap
     */
    public void addCap(byte[] origin) {
        try {
            pendingUrisDB.put(null, new DatabaseEntry(origin),
                    new DatabaseEntry(new byte[0]));
        } catch (DatabaseException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Utility method to perform action for all pending CrawlURI instances.
     * @param c Closure action to perform
     * @throws DatabaseException
     */
    protected void forAllPendingDo(Closure c) throws DatabaseException {
        DatabaseEntry key = new DatabaseEntry();
        DatabaseEntry value = new DatabaseEntry();
        Cursor cursor = pendingUrisDB.openCursor(null, null);
        while (cursor.getNext(key, value, null) == OperationStatus.SUCCESS) {
            if (value.getData().length == 0) {
                continue;
            }
            CrawlURI item = (CrawlURI) crawlUriBinding.entryToObject(value);
            c.execute(item);
        }
        cursor.close();
    }
}
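
As the annotation in deleteMatchingFromQueue notes, every JE record is simply a key plus a value, each wrapped in a DatabaseEntry. The sketch below is a minimal, standalone illustration of that pattern against a throwaway database; the environment directory ("je-demo"), database name ("demo"), sample key/value strings, and the JeEntryDemo class are invented for the example and are not part of Heritrix.

import java.io.File;
import java.nio.charset.StandardCharsets;

import com.sleepycat.je.Database;
import com.sleepycat.je.DatabaseConfig;
import com.sleepycat.je.DatabaseEntry;
import com.sleepycat.je.Environment;
import com.sleepycat.je.EnvironmentConfig;
import com.sleepycat.je.LockMode;
import com.sleepycat.je.OperationStatus;

public class JeEntryDemo {
    public static void main(String[] args) throws Exception {
        // Open (or create) a JE environment in a local directory.
        File dir = new File("je-demo");
        dir.mkdirs();
        EnvironmentConfig envConfig = new EnvironmentConfig();
        envConfig.setAllowCreate(true);
        Environment env = new Environment(dir, envConfig);

        DatabaseConfig dbConfig = new DatabaseConfig();
        dbConfig.setAllowCreate(true);
        Database db = env.openDatabase(null, "demo", dbConfig);

        // A record is a key plus a value, each wrapped in its own DatabaseEntry.
        DatabaseEntry key = new DatabaseEntry("example.com,".getBytes(StandardCharsets.UTF_8));
        DatabaseEntry value = new DatabaseEntry("http://example.com/".getBytes(StandardCharsets.UTF_8));
        OperationStatus putStatus = db.put(null, key, value);
        System.out.println("put: " + putStatus);

        // Reading it back needs a second, empty DatabaseEntry to receive the stored bytes.
        DatabaseEntry found = new DatabaseEntry();
        OperationStatus getStatus = db.get(null, key, found, LockMode.DEFAULT);
        if (getStatus == OperationStatus.SUCCESS) {
            System.out.println("got: " + new String(found.getData(), StandardCharsets.UTF_8));
        }

        db.close();
        env.close();
    }
}

The same put/get shape underlies the class above: put() serializes a CrawlURI into the value entry via crawlUriBinding.objectToEntry under a key built by calculateInsertKey, and get()/getNextNearestItem position a cursor on the queue's key prefix and turn the value bytes back into a CrawlURI with entryToObject.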
