Quartz 集群源码分析

项目中使用了Quartz调度框架,出现了多实例重复执行任务的情况,于是改成了集群配置,具体请参考这篇文章:[Spring boot下使用Quartz--多实例解决方案](http://blog.csdn.net/yy756127197/article/details/75980459)。集群配置一直在用,但没时间去了解它的原理,今天抽空研究一下,并记录下来。



/**     * 初始化paused 和 halted      *      */    QuartzSchedulerThread(QuartzScheduler qs, QuartzSchedulerResources qsRsrcs, boolean setDaemon, int threadPrio) {        super(qs.getSchedulerThreadGroup(), qsRsrcs.getThreadName());        this.qs = qs;        this.qsRsrcs = qsRsrcs;        this.setDaemon(setDaemon);        if(qsRsrcs.isThreadsInheritInitializersClassLoadContext()) {            log.info("QuartzSchedulerThread Inheriting ContextClassLoader of thread: " + Thread.currentThread().getName());            this.setContextClassLoader(Thread.currentThread().getContextClassLoader());        }        this.setPriority(threadPrio);        // start the underlying thread, but put this object into the 'paused'        // state        // so processing doesn't start yet...        paused = true;        halted = new AtomicBoolean(false);    }
@Override    public void run() {        boolean lastAcquireFailed = false;        while (!halted.get()) {            try {                // 检查触发器是否暂停或者停止状态                synchronized (sigLock) {                //AtomicBoolean类halted.get()检查触发器是否是暂停状态                    while (paused && !halted.get()) {                        try {                            // 线程等待                            //sigLock同步对象用来随时唤醒将被触发的Trigger(使用notifyAll来进行对wait中线程的唤醒)                                                        sigLock.wait(1000L);                        } catch (InterruptedException ignore) {                        }                    }                    if (halted.get()) {                        break;                    }                }                //得到可以获得的执行任务的线程数                int availThreadCount = qsRsrcs.getThreadPool().blockForAvailableThreads();                if(availThreadCount > 0) { // will always be true, due to semantics of blockForAvailableThreads...                    List<OperableTrigger> triggers = null;                    long now = System.currentTimeMillis();                    clearSignaledSchedulingChange();                    try {                        //得到需要执行的trigger                        triggers = qsRsrcs.getJobStore().acquireNextTriggers(                                now + idleWaitTime, Math.min(availThreadCount, qsRsrcs.getMaxBatchSize()), qsRsrcs.getBatchTimeWindow());                        lastAcquireFailed = false;                        if (log.isDebugEnabled())                             log.debug("batch acquisition of " + (triggers == null ? 
0 : triggers.size()) + " triggers");                    } catch (JobPersistenceException jpe) {                        if(!lastAcquireFailed) {                            qs.notifySchedulerListenersError(                                "An error occurred while scanning for the next triggers to fire.",                                jpe);                        }                        lastAcquireFailed = true;                        continue;                    } catch (RuntimeException e) {                        if(!lastAcquireFailed) {                            getLog().error("quartzSchedulerThreadLoop: RuntimeException "                                    +e.getMessage(), e);                        }                        lastAcquireFailed = true;                        continue;                    }                    if (triggers != null && !triggers.isEmpty()) {                        now = System.currentTimeMillis();                        long triggerTime = triggers.get(0).getNextFireTime().getTime();                        long timeUntilTrigger = triggerTime - now;                        while(timeUntilTrigger > 2) {                            synchronized (sigLock) {                                if (halted.get()) {                                    break;                                }                                if (!isCandidateNewTimeEarlierWithinReason(triggerTime, false)) {                                    try {                                        // we could have blocked a long while                                        // on 'synchronize', so we must recompute                                        now = System.currentTimeMillis();                                        timeUntilTrigger = triggerTime - now;                                        if(timeUntilTrigger >= 1)                                            //等待执行                                            sigLock.wait(timeUntilTrigger);                                    } 
catch (InterruptedException ignore) {                                    }                                }                            }                            if(releaseIfScheduleChangedSignificantly(triggers, triggerTime)) {                                break;                            }                            now = System.currentTimeMillis();                            timeUntilTrigger = triggerTime - now;                        }                        // this happens if releaseIfScheduleChangedSignificantly decided to release triggers                        if(triggers.isEmpty())                            continue;                        // set triggers to 'executing'                        List<TriggerFiredResult> bndles = new ArrayList<TriggerFiredResult>();                        boolean goAhead = true;                        synchronized(sigLock) {                            goAhead = !halted.get();                        }                        if(goAhead) {                            try {                            //执行                                List<TriggerFiredResult> res = qsRsrcs.getJobStore().triggersFired(triggers);                                if(res != null)                                    bndles = res;                            } catch (SchedulerException se) {                                qs.notifySchedulerListenersError(                                        "An error occurred while firing triggers '"                                                + triggers + "'", se);                                //QTZ-179 : a problem occurred interacting with the triggers from the db                                //we release them and loop again                                for (int i = 0; i < triggers.size(); i++) {                                //释放tigger  qsRsrcs.getJobStore().releaseAcquiredTrigger(triggers.get(i));                                }                                continue;                            }    
                    }                        for (int i = 0; i < bndles.size(); i++) {                            TriggerFiredResult result =  bndles.get(i);                            TriggerFiredBundle bndle =  result.getTriggerFiredBundle();                            Exception exception = result.getException();                            if (exception instanceof RuntimeException) {                                getLog().error("RuntimeException while firing trigger " + triggers.get(i), exception);                                qsRsrcs.getJobStore().releaseAcquiredTrigger(triggers.get(i));                                continue;                            }                            // it's possible to get 'null' if the triggers was paused,                            // blocked, or other similar occurrences that prevent it being                            // fired at this time...  or if the scheduler was shutdown (halted)                            if (bndle == null) {                                qsRsrcs.getJobStore().releaseAcquiredTrigger(triggers.get(i));                                continue;                            }                            JobRunShell shell = null;                            try {                                shell = qsRsrcs.getJobRunShellFactory().createJobRunShell(bndle);                                shell.initialize(qs);                            } catch (SchedulerException se) {                                qsRsrcs.getJobStore().triggeredJobComplete(triggers.get(i), bndle.getJobDetail(), CompletedExecutionInstruction.SET_ALL_JOB_TRIGGERS_ERROR);                                continue;                            }                            if (qsRsrcs.getThreadPool().runInThread(shell) == false) {                                // this case should never happen, as it is indicative of the                                // scheduler being shutdown or a bug in the thread pool or                                // a 
thread pool being used concurrently - which the docs                                // say not to do...                                getLog().error("ThreadPool.runInThread() return false!");                                qsRsrcs.getJobStore().triggeredJobComplete(triggers.get(i), bndle.getJobDetail(), CompletedExecutionInstruction.SET_ALL_JOB_TRIGGERS_ERROR);                            }                        }                        continue; // while (!halted)                    }                } else { // if(availThreadCount > 0)                    // should never happen, if threadPool.blockForAvailableThreads() follows contract                    continue; // while (!halted)                }                long now = System.currentTimeMillis();                long waitTime = now + getRandomizedIdleWaitTime();                long timeUntilContinue = waitTime - now;                synchronized(sigLock) {                    try {                      if(!halted.get()) {                        // QTZ-336 A job might have been completed in the mean time and we might have                        // missed the scheduled changed signal by not waiting for the notify() yet                        // Check that before waiting for too long in case this very job needs to be                        // scheduled very soon                        if (!isScheduleChanged()) {                          sigLock.wait(timeUntilContinue);                        }                      }                    } catch (InterruptedException ignore) {                    }                }            } catch(RuntimeException re) {                getLog().error("Runtime error occurred in main trigger firing loop.", re);            }        } // while (!halted)        // drop references to scheduler stuff to aid garbage collection...        qs = null;        qsRsrcs = null;    }

- QuartzScheduler调度线程不断获取trigger,触发trigger,释放trigger
- availThreadCount 必须大于0, 因为肯定至少得有一个线程来处理Trigger
- run方法在服务器启动后会不断地循环执行
- qsRsrcs.getJobStore().acquireNextTriggers() : 查找将要执行的trigger
- sigLock.wait() : 等待执行
- qsRsrcs.getJobStore().triggersFired(triggers) : 执行
- qsRsrcs.getJobStore().releaseAcquiredTrigger() :释放

  • sigLock同步对象用来随时唤醒在它上面等待(wait)的调度线程(通过notifyAll唤醒wait中的线程),源码如下:
/**
 * Switches the paused state of the main processing loop.
 *
 * <p>The flag is updated while holding {@code sigLock}. When pausing,
 * {@code signalSchedulingChange(0)} is invoked; when resuming, every
 * thread waiting on {@code sigLock} is woken with {@code notifyAll()}.</p>
 *
 * @param pause {@code true} to pause the loop, {@code false} to resume it
 */
void togglePause(boolean pause) {
    synchronized (sigLock) {
        paused = pause;

        if (!paused) {
            // Resuming: wake up whoever is waiting on the lock.
            sigLock.notifyAll();
            return;
        }

        // Pausing: signal a scheduling change.
        signalSchedulingChange(0);
    }
}

可以看出acquireNextTriggers、 triggersFired、 releaseAcquiredTrigger方法都进行了加锁处理
trigger相关操作,都必须获得 TRIGGER_ACCESS锁


acquireNextTriggers() 源码解析

 /**
  * Acquires the next triggers to fire by delegating to
  * {@code acquireNextTrigger} inside {@code executeInNonManagedTXLock}.
  *
  * <p>The work runs under the {@code LOCK_TRIGGER_ACCESS} lock when
  * {@code isAcquireTriggersWithinLock()} is true or more than one trigger
  * is requested; otherwise no lock name is passed. The supplied validator
  * reports success if at least one returned trigger's fire-instance id is
  * among the fired-trigger records selected for this instance.</p>
  *
  * @param noLaterThan passed through to {@code acquireNextTrigger}
  * @param maxCount    maximum number of triggers to acquire
  * @param timeWindow  passed through to {@code acquireNextTrigger}
  * @return the result of {@code acquireNextTrigger}
  * @throws JobPersistenceException if acquisition or validation fails
  */
 @SuppressWarnings("unchecked")
 public List<OperableTrigger> acquireNextTriggers(final long noLaterThan, final int maxCount, final long timeWindow)
     throws JobPersistenceException {

     // Lock only when configured to, or when acquiring a batch of triggers.
     final String lockName =
             (isAcquireTriggersWithinLock() || maxCount > 1) ? LOCK_TRIGGER_ACCESS : null;

     final TransactionCallback<List<OperableTrigger>> acquisition =
             new TransactionCallback<List<OperableTrigger>>() {
                 public List<OperableTrigger> execute(Connection conn) throws JobPersistenceException {
                     return acquireNextTrigger(conn, noLaterThan, maxCount, timeWindow);
                 }
             };

     final TransactionValidator<List<OperableTrigger>> acquisitionCheck =
             new TransactionValidator<List<OperableTrigger>>() {
                 public Boolean validate(Connection conn, List<OperableTrigger> result) throws JobPersistenceException {
                     try {
                         List<FiredTriggerRecord> ownRecords =
                                 getDelegate().selectInstancesFiredTriggerRecords(conn, getInstanceId());

                         // Collect the fire-instance ids recorded for this instance.
                         Set<String> recordedIds = new HashSet<String>();
                         for (FiredTriggerRecord record : ownRecords) {
                             recordedIds.add(record.getFireInstanceId());
                         }

                         // One matching trigger is enough to treat the acquisition as done.
                         boolean matched = false;
                         for (OperableTrigger candidate : result) {
                             if (recordedIds.contains(candidate.getFireInstanceId())) {
                                 matched = true;
                                 break;
                             }
                         }
                         return matched;
                     } catch (SQLException e) {
                         throw new JobPersistenceException("error validating trigger acquisition", e);
                     }
                 }
             };

     return executeInNonManagedTXLock(lockName, acquisition, acquisitionCheck);
 }


/** Lock name that acquireNextTriggers passes to executeInNonManagedTXLock when acquisition must run under a lock. */
protected static final String LOCK_TRIGGER_ACCESS = "TRIGGER_ACCESS";  


| sched_name | lock_name |
| --- | --- |
| schedulerFactoryBean | STATE_ACCESS |
| schedulerFactoryBean | TRIGGER_ACCESS |

这个常量传入executeInNonManagedTXLock(): 处理逻辑前先要获取锁, 处理完成后在finally里面释放锁


 /**
  * Runs {@code txCallback} on a non-managed connection, optionally while
  * holding the lock named {@code lockName}, and commits the connection.
  *
  * <p>If the commit fails, the connection is rolled back. The commit
  * failure is rethrown unless a validator was supplied and running it
  * through {@code retryExecuteInNonManagedTXLock} returns true. After
  * completion, a pending scheduling-change signal time (if non-null and
  * non-negative) is forwarded via
  * {@code signalSchedulingChangeImmediately}. The lock is released and
  * the connection cleaned up in {@code finally}, whatever the outcome.</p>
  *
  * @param lockName    name of the lock to obtain, or {@code null} to run
  *                    without a lock
  * @param txCallback  the work to execute with the connection
  * @param txValidator checked after a failed commit to decide whether the
  *                    result may still be returned; may be {@code null}
  * @return the value returned by {@code txCallback}
  * @throws JobPersistenceException if the callback or commit fails, or if
  *         a {@code RuntimeException} is thrown (wrapped)
  */
 protected <T> T executeInNonManagedTXLock(
         String lockName,
         TransactionCallback<T> txCallback, final TransactionValidator<T> txValidator) throws JobPersistenceException {
     boolean transOwner = false;
     Connection conn = null;
     try {
         if (lockName != null) {
             // If we aren't using db locks, then delay getting DB connection
             // until after acquiring the lock since it isn't needed.
             if (getLockHandler().requiresConnection()) {
                 conn = getNonManagedTXConnection();
             }
             // Obtain the lock (conn is still null here if the handler needs no connection).
             transOwner = getLockHandler().obtainLock(conn, lockName);
         }
         if (conn == null) {
             conn = getNonManagedTXConnection();
         }
         final T result = txCallback.execute(conn);
         try {
             commitConnection(conn);
         } catch (JobPersistenceException e) {
             rollbackConnection(conn);
             // Rethrow the commit failure unless the validator, run via the retry helper,
             // returns true for the result.
             if (txValidator == null || !retryExecuteInNonManagedTXLock(lockName, new TransactionCallback<Boolean>() {
                 @Override
                 public Boolean execute(Connection conn) throws JobPersistenceException {
                     return txValidator.validate(conn, result);
                 }
             })) {
                 throw e;
             }
         }
         Long sigTime = clearAndGetSignalSchedulingChangeOnTxCompletion();
         if(sigTime != null && sigTime >= 0) {
             signalSchedulingChangeImmediately(sigTime);
         }
         return result;
     } catch (JobPersistenceException e) {
         rollbackConnection(conn);
         throw e;
     } catch (RuntimeException e) {
         rollbackConnection(conn);
         throw new JobPersistenceException("Unexpected runtime exception: "
                 + e.getMessage(), e);
     } finally {
         try {
             // Release the lock.
             releaseLock(lockName, transOwner);
         } finally {
             // Runs even if releasing the lock throws.
             cleanupConnection(conn);
         }
     }
 }


initialize 源码

public void initialize(ClassLoadHelper loadHelper,            SchedulerSignaler signaler) throws SchedulerConfigException {        if (dsName == null) {             throw new SchedulerConfigException("DataSource name not set.");         }        classLoadHelper = loadHelper;        if(isThreadsInheritInitializersClassLoadContext()) {            log.info("JDBCJobStore threads will inherit ContextClassLoader of thread: " + Thread.currentThread().getName());            initializersLoader = Thread.currentThread().getContextClassLoader();        }        this.schedSignaler = signaler;        // If the user hasn't specified an explicit lock handler, then         // choose one based on CMT/Clustered/UseDBLocks.        if (getLockHandler() == null) {            // If the user hasn't specified an explicit lock handler,             // then we *must* use DB locks with clustering            if (isClustered()) {                setUseDBLocks(true);            }            if (getUseDBLocks()) {                if(getDriverDelegateClass() != null && getDriverDelegateClass().equals(MSSQLDelegate.class.getName())) {                    if(getSelectWithLockSQL() == null) {                        String msSqlDflt = "SELECT * FROM {0}LOCKS WITH (UPDLOCK,ROWLOCK) WHERE " + COL_SCHEDULER_NAME + " = {1} AND LOCK_NAME = ?";                        getLog().info("Detected usage of MSSQLDelegate class - defaulting 'selectWithLockSQL' to '" + msSqlDflt + "'.");                        setSelectWithLockSQL(msSqlDflt);                    }                }                getLog().info("Using db table-based data access locking (synchronization).");                setLockHandler(new StdRowLockSemaphore(getTablePrefix(), getInstanceName(), getSelectWithLockSQL()));            } else {                getLog().info(                    "Using thread monitor-based data access locking (synchronization).");                //设置LockHandler                setLockHandler(new SimpleSemaphore());            }    
    }    }


 // Excerpt from initialize(): the default row-lock SELECT installed when the
 // MSSQLDelegate is detected and no selectWithLockSQL has been configured.
 String msSqlDflt = "SELECT * FROM {0}LOCKS WITH (UPDLOCK,ROWLOCK) WHERE " + COL_SCHEDULER_NAME + " = {1} AND LOCK_NAME = ?";
 getLog().info("Detected usage of MSSQLDelegate class - defaulting 'selectWithLockSQL' to '" + msSqlDflt + "'.");
 setSelectWithLockSQL(msSqlDflt);



-- Selects the TRIGGER_ACCESS row of QRTZ_LOCKS with FOR UPDATE.
select * from QRTZ_LOCKS t where t.lock_name='TRIGGER_ACCESS' for update



乐观锁( Optimistic Locking ):相对悲观锁而言,乐观锁假设数据一般情况下不会产生冲突,所以只有在数据提交更新的时候,才会正式对数据是否冲突进行检测;如果发现冲突,则向用户返回错误信息,让用户决定如何去做。相对于悲观锁,乐观锁在对数据库进行处理的时候并不会使用数据库提供的锁机制,一般的实现方式是记录数据版本。

当一个线程执行上述SQL时,如果查询结果中包含相关的行,数据库就会对该行加行锁(ROW LOCK);此时如果有另外一个线程执行上述SQL,由于查询出的数据行已经被数据库锁住,这个线程就只能等待,直到前一个线程执行了commit动作、数据库释放了相关行的锁之后,它才能继续执行。这种先加锁再操作的方式属于悲观锁,与上面介绍的乐观锁正好相反。
