由TaskTracker启动到Task执行

来源：互联网发布：淘宝手办店推荐编辑：程序博客网时间：2024/06/01 20:54

1 TaskTracker main函数

public static void main(String argv[]) throws Exception {    StringUtils.startupShutdownMessage(TaskTracker.class, argv, LOG);    if (argv.length != 0) {      System.out.println("usage: TaskTracker");      System.exit(-1);    }    try {      JobConf conf=new JobConf();      // enable the server to track time spent waiting on locks      ReflectionUtils.setContentionTracing        (conf.getBoolean("tasktracker.contention.tracking", false));      new TaskTracker(conf).run();    } catch (Throwable e) {      LOG.error("Can not start task tracker because "+                StringUtils.stringifyException(e));      System.exit(-1);    }  }

可知核心是 new TaskTracker(conf).run(); 这句代码：先创建一个新的TaskTracker对象，然后在当前（主）线程中直接调用它的run()方法——这里调用的是run()而不是start()，并没有另起线程。

2 TaskTracker的构造函数

 public TaskTracker(JobConf conf) throws IOException {    originalConf = conf;    maxCurrentMapTasks = conf.getInt(                  "mapred.tasktracker.map.tasks.maximum", 2);    maxCurrentReduceTasks = conf.getInt(                  "mapred.tasktracker.reduce.tasks.maximum", 2);    this.jobTrackAddr = JobTracker.getAddress(conf);    String infoAddr =       NetUtils.getServerAddress(conf,                                "tasktracker.http.bindAddress",                                 "tasktracker.http.port",                                "mapred.task.tracker.http.address");    InetSocketAddress infoSocAddr = NetUtils.createSocketAddr(infoAddr);    String httpBindAddress = infoSocAddr.getHostName();    int httpPort = infoSocAddr.getPort();    this.server = new HttpServer("task", httpBindAddress, httpPort,        httpPort == 0, conf);    workerThreads = conf.getInt("tasktracker.http.threads", 40);    this.shuffleServerMetrics = new ShuffleServerMetrics(conf);    server.setThreads(1, workerThreads);    // let the jsp pages get to the task tracker, config, and other relevant    // objects    FileSystem local = FileSystem.getLocal(conf);    this.localDirAllocator = new LocalDirAllocator("mapred.local.dir");    server.setAttribute("task.tracker", this);    server.setAttribute("local.file.system", local);    server.setAttribute("conf", conf);    server.setAttribute("log", LOG);    server.setAttribute("localDirAllocator", localDirAllocator);    server.setAttribute("shuffleServerMetrics", shuffleServerMetrics);    server.addInternalServlet("mapOutput", "/mapOutput", MapOutputServlet.class);    server.addInternalServlet("taskLog", "/tasklog", TaskLogServlet.class);    server.start();    this.httpPort = server.getPort();    checkJettyPort(httpPort);    initialize();  }

该构造函数的主要工作有两个：一是创建并启动HttpServer（注册mapOutput和taskLog两个servlet）；二是调用initialize()。其中initialize()才真正完成TaskTracker初始化的主要工作。

3 initialize函数

/**
 * (Re)initializes the tracker's runtime state from {@code originalConf}.
 *
 * In order, this method: resolves the local host name, checks and cleans the
 * local directories, resets the task/job tables and counters, creates the
 * instrumentation object, starts the task-report RPC server, obtains the
 * RPC proxy to the JobTracker, starts the map-events fetcher thread, sets up
 * memory management and the index cache, and starts the map and reduce
 * task launchers.
 *
 * @throws IOException if a local-directory check, the RPC server or the
 *         JobTracker proxy cannot be set up
 */
synchronized void initialize() throws IOException {
    // Work on a private copy so later set() calls do not touch originalConf.
    // use configured nameserver & interface to get local hostname
    this.fConf = new JobConf(originalConf);
    // An explicitly configured host name wins over DNS resolution.
    if (fConf.get("slave.host.name") != null) {
      this.localHostname = fConf.get("slave.host.name");
    }
    if (localHostname == null) {
      this.localHostname =
      DNS.getDefaultHost
      (fConf.get("mapred.tasktracker.dns.interface","default"),
       fConf.get("mapred.tasktracker.dns.nameserver","default"));
    }

    //check local disk
    checkLocalDirs(this.fConf.getLocalDirs());
    fConf.deleteLocalFiles(SUBDIR);

    // Clear out state tables
    this.tasks.clear();
    this.runningTasks = new LinkedHashMap<TaskAttemptID, TaskInProgress>();
    this.runningJobs = new TreeMap<JobID, RunningJob>();
    this.mapTotal = 0;
    this.reduceTotal = 0;
    this.acceptNewTasks = true;
    this.status = null;

    this.minSpaceStart = this.fConf.getLong("mapred.local.dir.minspacestart", 0L);
    this.minSpaceKill = this.fConf.getLong("mapred.local.dir.minspacekill", 0L);
    // Batch size read straight from configuration (default 500).
    // NOTE(review): the original comment spoke of deriving this from
    // numCopiers; no such computation is visible here.
    probe_sample_size = this.fConf.getInt("mapred.tasktracker.events.batchsize", 500);

    // Instantiate the configured instrumentation class through its
    // (TaskTracker) constructor.
    Class<? extends TaskTrackerInstrumentation> metricsInst = getInstrumentationClass(fConf);
    try {
      java.lang.reflect.Constructor<? extends TaskTrackerInstrumentation> c =
        metricsInst.getConstructor(new Class[] {TaskTracker.class} );
      this.myInstrumentation = c.newInstance(this);
    } catch(Exception e) {
      //Reflection can throw lots of exceptions -- handle them all by
      //falling back on the default.
      LOG.error("failed to initialize taskTracker metrics", e);
      this.myInstrumentation = new TaskTrackerMetricsInst(this);
    }

    // bind address
    String address =
      NetUtils.getServerAddress(fConf,
                                "mapred.task.tracker.report.bindAddress",
                                "mapred.task.tracker.report.port",
                                "mapred.task.tracker.report.address");
    InetSocketAddress socAddr = NetUtils.createSocketAddr(address);
    String bindAddress = socAddr.getHostName();
    int tmpPort = socAddr.getPort();

    this.jvmManager = new JvmManager(this);

    // Set service-level authorization security policy
    // (only when the authorization switch is enabled; default is off)
    if (this.fConf.getBoolean(
          ServiceAuthorizationManager.SERVICE_AUTHORIZATION_CONFIG, false)) {
      PolicyProvider policyProvider =
        (PolicyProvider)(ReflectionUtils.newInstance(
            this.fConf.getClass(PolicyProvider.POLICY_PROVIDER_CONFIG,
                MapReducePolicyProvider.class, PolicyProvider.class),
            this.fConf));
      SecurityUtil.setPolicy(new ConfiguredPolicy(this.fConf, policyProvider));
    }

    // RPC initialization
    // max = larger of the map and reduce slot limits
    int max = maxCurrentMapTasks > maxCurrentReduceTasks ?
                       maxCurrentMapTasks : maxCurrentReduceTasks;
    //set the num handlers to max*2 since canCommit may wait for the duration
    //of a heartbeat RPC
    this.taskReportServer =
      RPC.getServer(this, bindAddress, tmpPort, 2 * max, false, this.fConf);
    this.taskReportServer.start();

    // get the assigned address
    // and write it back into the configuration copy
    this.taskReportAddress = taskReportServer.getListenerAddress();
    this.fConf.set("mapred.task.tracker.report.address",
        taskReportAddress.getHostName() + ":" + taskReportAddress.getPort());
    LOG.info("TaskTracker up at: " + this.taskReportAddress);

    // Tracker name = "tracker_" + host + ":" + report address.
    this.taskTrackerName = "tracker_" + localHostname + ":" + taskReportAddress;
    LOG.info("Starting tracker " + taskTrackerName);

    // Clear out temporary files that might be lying around
    DistributedCache.purgeCache(this.fConf);
    cleanupStorage();

    // RPC proxy to the JobTracker at the address resolved by the constructor.
    // NOTE(review): waitForProxy presumably blocks until the JobTracker is
    // reachable - confirm against the RPC class.
    this.jobClient = (InterTrackerProtocol)
      RPC.waitForProxy(InterTrackerProtocol.class,
                       InterTrackerProtocol.versionID,
                       jobTrackAddr, this.fConf);
    this.justInited = true;
    this.running = true;
    // start the thread that will fetch map task completion events
    this.mapEventsFetcher = new MapEventsFetcherThread();
    mapEventsFetcher.setDaemon(true);
    mapEventsFetcher.setName(
                             "Map-events fetcher for all reduce tasks " + "on " +
                             taskTrackerName);
    mapEventsFetcher.start();

    initializeMemoryManagement();

    this.indexCache = new IndexCache(this.fConf);

    // One launcher per task type, each sized by its slot limit.
    mapLauncher = new TaskLauncher(maxCurrentMapTasks);
    reduceLauncher = new TaskLauncher(maxCurrentReduceTasks);
    mapLauncher.start();
    reduceLauncher.start();
  }

其主要功能是：初始化一般属性；创建并启动taskReportServer；通过RPC获得一个指向JobTracker的jobClient代理引用（创建jobClient用到的地址在构造函数中已经初始化：this.jobTrackAddr = JobTracker.getAddress(conf);）；启动获取map完成事件的mapEventsFetcher线程；最后创建mapLauncher和reduceLauncher并启动。

4 mapLauncher.run()

    public void run() {      while (!Thread.interrupted()) {        try {          TaskInProgress tip;          synchronized (tasksToLaunch) {            while (tasksToLaunch.isEmpty()) {              tasksToLaunch.wait();            }            //get the TIP            tip = tasksToLaunch.remove(0);            LOG.info("Trying to launch : " + tip.getTask().getTaskID());          }          //wait for a slot to run          synchronized (numFreeSlots) {            while (numFreeSlots.get() == 0) {              numFreeSlots.wait();            }            LOG.info("In TaskLauncher, current free slots : " + numFreeSlots.get()+                " and trying to launch "+tip.getTask().getTaskID());            numFreeSlots.set(numFreeSlots.get() - 1);            assert (numFreeSlots.get() >= 0);          }          synchronized (tip) {            //to make sure that there is no kill task action for this            if (tip.getRunState() != TaskStatus.State.UNASSIGNED &&                tip.getRunState() != TaskStatus.State.FAILED_UNCLEAN &&                tip.getRunState() != TaskStatus.State.KILLED_UNCLEAN) {              //got killed externally while still in the launcher queue              addFreeSlot();              continue;            }            tip.slotTaken = true;          }          //got a free slot. launch the task          startNewTask(tip);        } catch (InterruptedException e) {           return; // ALL DONE        } catch (Throwable th) {          LOG.error("TaskLauncher error " +               StringUtils.stringifyException(th));        }      }    }

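它先等待任务队列tasksToLaunch中有任务到来，再等待出现空闲的slot并占用一个；两者兼备后检查任务状态：只有仍处于UNASSIGNED、FAILED_UNCLEAN或KILLED_UNCLEAN状态的任务才会执行startNewTask(tip);，否则（任务在排队期间已被外部kill）归还slot并继续处理下一个任务。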

5 startNewTask

  private void startNewTask(TaskInProgress tip) {    try {      localizeJob(tip);    } catch (Throwable e) {      String msg = ("Error initializing " + tip.getTask().getTaskID() +                     ":\n" + StringUtils.stringifyException(e));      LOG.warn(msg);      tip.reportDiagnosticInfo(msg);      try {        tip.kill(true);        tip.cleanup(true);      } catch (IOException ie2) {        LOG.info("Error cleaning up " + tip.getTask().getTaskID() + ":\n" +                 StringUtils.stringifyException(ie2));                }              // Careful!       // This might not be an 'Exception' - don't handle 'Error' here!      if (e instanceof Error) {        throw ((Error) e);      }    }  }

直接进入localizeJob

6 localizeJob

/**
 * Makes the job of {@code tip} available on the local disk (once per job)
 * and then launches the task.
 *
 * On the first task of a job this copies the job file to a local
 * {@code job.xml}, creates the job's {@code work} directory, and, when the
 * job configuration names a jar, copies it to {@code jars/job.jar},
 * rewrites the local {@code job.xml} and unpacks the jar. Later tasks of the
 * same job find {@code rjob.localized} set and skip straight to the launch.
 *
 * @param tip the task whose job must be localized
 * @throws IOException if a directory cannot be created or a copy fails
 */
private void localizeJob(TaskInProgress tip) throws IOException {
    Path localJarFile = null;
    Task t = tip.getTask();
    JobID jobId = t.getJobID();
    Path jobFile = new Path(t.getJobFile());
    // Get sizes of JobFile and JarFile
    // sizes are -1 if they are not present.
    FileStatus status = null;
    long jobFileSize = -1;
    try {
      status = systemFS.getFileStatus(jobFile);
      jobFileSize = status.getLen();
    } catch(FileNotFoundException fe) {
      jobFileSize = -1;
    }
    // Pick a local path for job.xml, passing the job-file size to the
    // allocator.
    Path localJobFile = lDirAlloc.getLocalPathForWrite(
                                    getLocalJobDir(jobId.toString())
                                    + Path.SEPARATOR + "job.xml",
                                    jobFileSize, fConf);
    RunningJob rjob = addTaskToJob(jobId, tip);
    // The per-job lock plus the localized flag make the copy happen once
    // per job.
    synchronized (rjob) {
      if (!rjob.localized) {

        FileSystem localFs = FileSystem.getLocal(fConf);
        // this will happen on a partial execution of localizeJob.
        // Sometimes the job.xml gets copied but copying job.jar
        // might throw out an exception
        // we should clean up and then try again
        Path jobDir = localJobFile.getParent();
        if (localFs.exists(jobDir)){
          localFs.delete(jobDir, true);
          boolean b = localFs.mkdirs(jobDir);
          if (!b)
            throw new IOException("Not able to create job directory "
                                  + jobDir.toString());
        }
        // Copy the job file locally and load it as this job's configuration.
        systemFS.copyToLocalFile(jobFile, localJobFile);
        JobConf localJobConf = new JobConf(localJobFile);

        // create the 'work' directory
        // job-specific shared directory for use as scratch space
        Path workDir = lDirAlloc.getLocalPathForWrite(
                         (getLocalJobDir(jobId.toString())
                         + Path.SEPARATOR + "work"), fConf);
        if (!localFs.mkdirs(workDir)) {
          throw new IOException("Mkdirs failed to create "
                      + workDir.toString());
        }
        // Published both as a JVM-wide system property and in the job conf.
        System.setProperty("job.local.dir", workDir.toString());
        localJobConf.set("job.local.dir", workDir.toString());

        // copy Jar file to the local FS and unjar it.
        String jarFile = localJobConf.getJar();
        long jarFileSize = -1;
        if (jarFile != null) {
          Path jarFilePath = new Path(jarFile);
          try {
            status = systemFS.getFileStatus(jarFilePath);
            jarFileSize = status.getLen();
          } catch(FileNotFoundException fe) {
            jarFileSize = -1;
          }
          // Ask the allocator for five times the jar size
          // to accommodate for unjarring the jar file in work directory
          localJarFile = new Path(lDirAlloc.getLocalPathForWrite(
                                     getLocalJobDir(jobId.toString())
                                     + Path.SEPARATOR + "jars",
                                     5 * jarFileSize, fConf), "job.jar");
          if (!localFs.mkdirs(localJarFile.getParent())) {
            throw new IOException("Mkdirs failed to create jars directory ");
          }
          systemFS.copyToLocalFile(jarFilePath, localJarFile);
          // Point the configuration at the local jar and persist it.
          // NOTE(review): the local job.xml is rewritten only on this
          // jar-present path, so for a jar-less job the job.local.dir
          // setting above stays in memory only.
          localJobConf.setJar(localJarFile.toString());
          OutputStream out = localFs.create(localJobFile);
          try {
            localJobConf.writeXml(out);
          } finally {
            out.close();
          }
          // also unjar the job.jar files
          // (into the directory that holds job.jar)
          RunJar.unJar(new File(localJarFile.toString()),
                       new File(localJarFile.getParent().toString()));
        }
        rjob.keepJobFiles = ((localJobConf.getKeepTaskFilesPattern() != null) ||
                             localJobConf.getKeepFailedTaskFiles());
        rjob.localized = true;
        rjob.jobConf = localJobConf;
      }
    }
    // Each task gets its own copy of the job configuration.
    launchTaskForJob(tip, new JobConf(rjob.jobConf));
  }

前面的主要功能是将作业本地化：systemFS.copyToLocalFile(jobFile, localJobFile); 把作业配置文件拷贝为本地的job.xml；systemFS.copyToLocalFile(jarFilePath, localJarFile); 把作业的jar包拷贝为本地的jars/job.jar（随后由RunJar.unJar解压）。也就是说，这两句拷贝的是作业的配置和程序，而不是输入数据。

最后launchTaskForJob(tip, new JobConf(rjob.jobConf));

7 launchTaskForJob

  /**
   * Hands the job configuration to the task and launches it.
   *
   * Both steps run while holding the monitor of {@code tip}, so other code
   * synchronizing on the same task cannot run between them.
   *
   * @param tip     the task to launch
   * @param jobConf the configuration the task should run with
   * @throws IOException if launching the task fails
   */
  private void launchTaskForJob(TaskInProgress tip, JobConf jobConf)
      throws IOException {
    synchronized (tip) {
      // configuration first, then launch
      tip.setJobConf(jobConf);
      tip.launchTask();
    }
  }

8 tip.launchTask

    /**
     * Starts this task when it is still in a launchable state.
     *
     * Launchable states are UNASSIGNED, FAILED_UNCLEAN and KILLED_UNCLEAN.
     * For those the task is localized, an UNASSIGNED task is moved to
     * RUNNING, a runner is created and started, and the start time is
     * recorded. In any other state only a log line is written.
     *
     * @throws IOException if localizing the task fails
     */
    public synchronized void launchTask() throws IOException {
      TaskStatus.State stateAtEntry = this.taskStatus.getRunState();
      boolean launchable = stateAtEntry == TaskStatus.State.UNASSIGNED
          || stateAtEntry == TaskStatus.State.FAILED_UNCLEAN
          || stateAtEntry == TaskStatus.State.KILLED_UNCLEAN;

      if (!launchable) {
        LOG.info("Not launching task: " + task.getTaskID() +
            " since it's state is " + this.taskStatus.getRunState());
        return;
      }

      localizeTask(task);

      // Re-read the state after localization, as the original code does;
      // only a task that is still UNASSIGNED becomes RUNNING.
      if (this.taskStatus.getRunState() == TaskStatus.State.UNASSIGNED) {
        this.taskStatus.setRunState(TaskStatus.State.RUNNING);
      }

      this.runner = task.createRunner(TaskTracker.this, this);
      this.runner.start();
      this.taskStatus.setStartTime(System.currentTimeMillis());
    }

9 TaskRunner.run()

 /**
  * Runs one task attempt from start to finish on this runner thread.
  *
  * Steps, in order: localize the distributed-cache archives and files and
  * rewrite the task's job file; call {@code prepare()}; build the child
  * classpath and JVM argument vector (java options, library path, tmp dir,
  * logging properties, optional profiling, main class and its arguments);
  * hand the launch to the JVM manager; wait until {@code done} is set; and
  * turn a non-zero exit code of a task that was not killed into an
  * IOException. File-system errors and all other throwables are reported to
  * the tracker. Cache references are always released and the task is always
  * reported finished.
  */
 public final void run() {
    try {

      //before preparing the job localize
      //all the archives
      TaskAttemptID taskid = t.getTaskID();
      LocalDirAllocator lDirAlloc = new LocalDirAllocator("mapred.local.dir");
      // Directory holding the job jar; stays null when the job has no jar.
      File jobCacheDir = null;
      if (conf.getJar() != null) {
        jobCacheDir = new File(
                          new Path(conf.getJar()).getParent().toString());
      }
      // The task attempt's working directory.
      File workDir = new File(lDirAlloc.getLocalPathToRead(
                                TaskTracker.getLocalTaskDir(
                                  t.getJobID().toString(),
                                  t.getTaskID().toString(),
                                  t.isTaskCleanupTask())
                                + Path.SEPARATOR + MRConstants.WORKDIR,
                                conf). toString());

      URI[] archives = DistributedCache.getCacheArchives(conf);
      URI[] files = DistributedCache.getCacheFiles(conf);
      FileStatus fileStatus;
      FileSystem fileSystem;
      Path localPath;
      String baseDir;

      // Localize the distributed cache only when the job uses it.
      if ((archives != null) || (files != null)) {
        if (archives != null) {
          String[] archivesTimestamps =
                               DistributedCache.getArchiveTimestamps(conf);
          Path[] p = new Path[archives.length];
          for (int i = 0; i < archives.length;i++){
            fileSystem = FileSystem.get(archives[i], conf);
            fileStatus = fileSystem.getFileStatus(
                                      new Path(archives[i].getPath()));
            String cacheId = DistributedCache.makeRelative(archives[i],conf);
            String cachePath = TaskTracker.getCacheSubdir() +
                                 Path.SEPARATOR + cacheId;

            localPath = lDirAlloc.getLocalPathForWrite(cachePath,
                                      fileStatus.getLen(), conf);
            // Remove the cache id from the allocated path to get the base
            // directory handed to getLocalCache.
            baseDir = localPath.toString().replace(cacheId, "");
            // The boolean after fileStatus is true for archives and false
            // for plain files (compare the loop below).
            p[i] = DistributedCache.getLocalCache(archives[i], conf,
                                                  new Path(baseDir),
                                                  fileStatus,
                                                  true, Long.parseLong(
                                                        archivesTimestamps[i]),
                                                  new Path(workDir.
                                                        getAbsolutePath()),
                                                  false);

          }
          DistributedCache.setLocalArchives(conf, stringifyPathArray(p));
        }
        if ((files != null)) {
          String[] fileTimestamps = DistributedCache.getFileTimestamps(conf);
          Path[] p = new Path[files.length];
          for (int i = 0; i < files.length;i++){
            fileSystem = FileSystem.get(files[i], conf);
            fileStatus = fileSystem.getFileStatus(
                                      new Path(files[i].getPath()));
            String cacheId = DistributedCache.makeRelative(files[i], conf);
            String cachePath = TaskTracker.getCacheSubdir() +
                                 Path.SEPARATOR + cacheId;

            localPath = lDirAlloc.getLocalPathForWrite(cachePath,
                                      fileStatus.getLen(), conf);
            baseDir = localPath.toString().replace(cacheId, "");
            p[i] = DistributedCache.getLocalCache(files[i], conf,
                                                  new Path(baseDir),
                                                  fileStatus,
                                                  false, Long.parseLong(
                                                           fileTimestamps[i]),
                                                  new Path(workDir.
                                                        getAbsolutePath()),
                                                  false);
          }
          DistributedCache.setLocalFiles(conf, stringifyPathArray(p));
        }
        // Rewrite the task's job file so it carries the local cache paths
        // just stored in conf.
        Path localTaskFile = new Path(t.getJobFile());
        FileSystem localFs = FileSystem.getLocal(conf);
        localFs.delete(localTaskFile, true);
        OutputStream out = localFs.create(localTaskFile);
        try {
          conf.writeXml(out);
        } finally {
          out.close();
        }
      }

      // prepare() can veto the launch; nothing is started in that case.
      if (!prepare()) {
        return;
      }

      String sep = System.getProperty("path.separator");
      StringBuffer classPath = new StringBuffer();
      // start with same classpath as parent process
      classPath.append(System.getProperty("java.class.path"));
      classPath.append(sep);
      // A failed mkdirs is logged only when the directory really is missing;
      // execution continues either way.
      if (!workDir.mkdirs()) {
        if (!workDir.isDirectory()) {
          LOG.fatal("Mkdirs failed to create " + workDir.toString());
        }
      }

      String jar = conf.getJar();
      if (jar != null) {
        // The job has a jar: add its lib/ entries, its classes/ directory
        // and the jar directory itself to the classpath.
        File[] libs = new File(jobCacheDir, "lib").listFiles();
        if (libs != null) {
          for (int i = 0; i < libs.length; i++) {
            classPath.append(sep);            // add libs from jar to classpath
            classPath.append(libs[i]);
          }
        }
        classPath.append(sep);
        classPath.append(new File(jobCacheDir, "classes"));
        classPath.append(sep);
        classPath.append(jobCacheDir);

      }

      // include the user specified classpath

      //archive paths
      // A cached archive goes on the classpath when its URI path equals one
      // of the configured archive classpath entries.
      Path[] archiveClasspaths = DistributedCache.getArchiveClassPaths(conf);
      if (archiveClasspaths != null && archives != null) {
        Path[] localArchives = DistributedCache
          .getLocalCacheArchives(conf);
        if (localArchives != null){
          for (int i=0;i<archives.length;i++){
            for(int j=0;j<archiveClasspaths.length;j++){
              if (archives[i].getPath().equals(
                                               archiveClasspaths[j].toString())){
                classPath.append(sep);
                classPath.append(localArchives[i]
                                 .toString());
              }
            }
          }
        }
      }
      //file paths
      // Same matching rule for cached plain files.
      Path[] fileClasspaths = DistributedCache.getFileClassPaths(conf);
      if (fileClasspaths!=null && files != null) {
        Path[] localFiles = DistributedCache
          .getLocalCacheFiles(conf);
        if (localFiles != null) {
          for (int i = 0; i < files.length; i++) {
            for (int j = 0; j < fileClasspaths.length; j++) {
              if (files[i].getPath().equals(
                                            fileClasspaths[j].toString())) {
                classPath.append(sep);
                classPath.append(localFiles[i].toString());
              }
            }
          }
        }
      }

      // The working directory is the last classpath entry.
      classPath.append(sep);
      classPath.append(workDir);
      //  Build exec child jvm args.
      Vector<String> vargs = new Vector<String>(8);
      File jvm =                                  // use same jvm as parent
        new File(new File(System.getProperty("java.home"), "bin"), "java");

      vargs.add(jvm.toString());

      // Add child (task) java-vm options.
      //
      // The following symbols if present in mapred.child.java.opts value are
      // replaced:
      // + @taskid@ is interpolated with value of TaskID.
      // Other occurrences of @ will not be altered.
      //
      // Example with multiple arguments and substitutions, showing
      // jvm GC logging, and start of a passwordless JVM JMX agent so can
      // connect with jconsole and the likes to watch child memory, threads
      // and get thread dumps.
      //
      //  <property>
      //    <name>mapred.child.java.opts</name>
      //    <value>-verbose:gc -Xloggc:/tmp/@taskid@.gc \
      //           -Dcom.sun.management.jmxremote.authenticate=false \
      //           -Dcom.sun.management.jmxremote.ssl=false \
      //    </value>
      //  </property>
      //
      String javaOpts = conf.get("mapred.child.java.opts", "-Xmx200m");
      javaOpts = javaOpts.replace("@taskid@", taskid.toString());
      // Options are split on single spaces.
      String [] javaOptsSplit = javaOpts.split(" ");

      // Add java.library.path; necessary for loading native libraries.
      //
      // 1. To support native-hadoop library i.e. libhadoop.so, we add the
      //    parent processes' java.library.path to the child.
      // 2. We also add the 'cwd' of the task to it's java.library.path to help
      //    users distribute native libraries via the DistributedCache.
      // 3. The user can also specify extra paths to be added to the
      //    java.library.path via mapred.child.java.opts.
      //
      String libraryPath = System.getProperty("java.library.path");
      if (libraryPath == null) {
        libraryPath = workDir.getAbsolutePath();
      } else {
        libraryPath += sep + workDir;
      }
      // When the user already passed -Djava.library.path, extend that option
      // in place (first match only); otherwise add one.
      boolean hasUserLDPath = false;
      for(int i=0; i<javaOptsSplit.length ;i++) {
        if(javaOptsSplit[i].startsWith("-Djava.library.path=")) {
          javaOptsSplit[i] += sep + libraryPath;
          hasUserLDPath = true;
          break;
        }
      }
      if(!hasUserLDPath) {
        vargs.add("-Djava.library.path=" + libraryPath);
      }
      for (int i = 0; i < javaOptsSplit.length; i++) {
        vargs.add(javaOptsSplit[i]);
      }

      // add java.io.tmpdir given by mapred.child.tmp
      String tmp = conf.get("mapred.child.tmp", "./tmp");
      Path tmpDir = new Path(tmp);

      // if temp directory path is not absolute
      // prepend it with workDir.
      if (!tmpDir.isAbsolute()) {
        tmpDir = new Path(workDir.toString(), tmp);
      }
      FileSystem localFs = FileSystem.getLocal(conf);
      // Fail only when mkdirs reports failure AND the path is not already a
      // directory.
      if (!localFs.mkdirs(tmpDir) && !localFs.getFileStatus(tmpDir).isDir()) {
        throw new IOException("Mkdirs failed to create " + tmpDir.toString());
      }
      vargs.add("-Djava.io.tmpdir=" + tmpDir.toString());

      // Add classpath.
      vargs.add("-classpath");
      vargs.add(classPath.toString());

      // Setup the log4j prop
      long logSize = TaskLog.getTaskLogLength(conf);
      vargs.add("-Dhadoop.log.dir=" +
          new File(System.getProperty("hadoop.log.dir")
          ).getAbsolutePath());
      vargs.add("-Dhadoop.root.logger=INFO,TLA");
      vargs.add("-Dhadoop.tasklog.taskid=" + taskid);
      vargs.add("-Dhadoop.tasklog.totalLogFileSize=" + logSize);

      // Profiling arguments are added only for tasks whose partition falls
      // in the configured profile range for their task type.
      if (conf.getProfileEnabled()) {
        if (conf.getProfileTaskRange(t.isMapTask()
                                     ).isIncluded(t.getPartition())) {
          File prof = TaskLog.getTaskLogFile(taskid, TaskLog.LogName.PROFILE);
          vargs.add(String.format(conf.getProfileParams(), prof.toString()));
        }
      }

      // Add main class and its arguments
      vargs.add(Child.class.getName());  // main of Child
      // pass umbilical address
      InetSocketAddress address = tracker.getTaskTrackerReportAddress();
      vargs.add(address.getAddress().getHostAddress());
      vargs.add(Integer.toString(address.getPort()));
      vargs.add(taskid.toString());                      // pass task identifier

      // Allocate the pid file path, record it on the task and register the
      // task with the tracker's memory manager.
      String pidFile = lDirAlloc.getLocalPathForWrite(
            (TaskTracker.getPidFile(t.getJobID().toString(),
             taskid.toString(), t.isTaskCleanupTask())),
            this.conf).toString();
      t.setPidFile(pidFile);
      tracker.addToMemoryManager(t.getTaskID(), t.isMapTask(), conf, pidFile);

      // set memory limit using ulimit if feasible and necessary ...
      // (setup stays null when no ulimit command applies)
      String[] ulimitCmd = Shell.getUlimitMemoryCommand(conf);
      List<String> setup = null;
      if (ulimitCmd != null) {
        setup = new ArrayList<String>();
        for (String arg : ulimitCmd) {
          setup.add(arg);
        }
      }

      // Set up the redirection of the task's stdout and stderr streams
      File stdout = TaskLog.getTaskLogFile(taskid, TaskLog.LogName.STDOUT);
      File stderr = TaskLog.getTaskLogFile(taskid, TaskLog.LogName.STDERR);
      stdout.getParentFile().mkdirs();
      tracker.getTaskTrackerInstrumentation().reportTaskLaunch(taskid, stdout, stderr);

      // Child environment: LD_LIBRARY_PATH = workDir, followed by the
      // parent's LD_LIBRARY_PATH when one is set.
      Map<String, String> env = new HashMap<String, String>();
      StringBuffer ldLibraryPath = new StringBuffer();
      ldLibraryPath.append(workDir.toString());
      String oldLdLibraryPath = null;
      oldLdLibraryPath = System.getenv("LD_LIBRARY_PATH");
      if (oldLdLibraryPath != null) {
        ldLibraryPath.append(sep);
        ldLibraryPath.append(oldLdLibraryPath);
      }
      env.put("LD_LIBRARY_PATH", ldLibraryPath.toString());
      // Hand the launch to the JVM manager, then block until done is set.
      jvmManager.launchJvm(this,
          jvmManager.constructJvmEnv(setup,vargs,stdout,stderr,logSize,
              workDir, env, pidFile, conf));
      synchronized (lock) {
        while (!done) {
          lock.wait();
        }
      }
      tracker.getTaskTrackerInstrumentation().reportTaskEnd(t.getTaskID());
      // A non-zero exit code of a task that was not killed is a failure;
      // exit code 65 additionally triggers taskFailedPing.
      if (exitCodeSet) {
        if (!killed && exitCode != 0) {
          if (exitCode == 65) {
            tracker.getTaskTrackerInstrumentation().taskFailedPing(t.getTaskID());
          }
          throw new IOException("Task process exit with nonzero status of " +
              exitCode + ".");
        }
      }
    } catch (FSError e) {
      // File-system errors are reported to the tracker through fsError.
      LOG.fatal("FSError", e);
      try {
        tracker.fsError(t.getTaskID(), e.getMessage());
      } catch (IOException ie) {
        LOG.fatal(t.getTaskID()+" reporting FSError", ie);
      }
    } catch (Throwable throwable) {
      // Everything else: log it and send the stack trace as diagnostic info.
      LOG.warn(t.getTaskID()+" Child Error", throwable);
      ByteArrayOutputStream baos = new ByteArrayOutputStream();
      throwable.printStackTrace(new PrintStream(baos));
      try {
        tracker.reportDiagnosticInfo(t.getTaskID(), baos.toString());
      } catch (IOException e) {
        LOG.warn(t.getTaskID()+" Reporting Diagnostics", e);
      }
    } finally {
      // Always release the distributed-cache entries and report the task as
      // finished, whatever happened above.
      try{
        URI[] archives = DistributedCache.getCacheArchives(conf);
        URI[] files = DistributedCache.getCacheFiles(conf);
        if (archives != null){
          for (int i = 0; i < archives.length; i++){
            DistributedCache.releaseCache(archives[i], conf);
          }
        }
        if (files != null){
          for(int i = 0; i < files.length; i++){
            DistributedCache.releaseCache(files[i], conf);
          }
        }
      }catch(IOException ie){
        LOG.warn("Error releasing caches : Cache files might not have been cleaned up");
      }
      tip.reportTaskFinished();
    }
 }

10 jvmManager.launchJvm

/**
 * Routes a launch request to the manager for the task's type: the map JVM
 * manager for map tasks, the reduce JVM manager for everything else.
 *
 * @param t   runner of the task to launch
 * @param env environment the child JVM should be started with
 */
public void launchJvm(TaskRunner t, JvmEnv env) {
  final boolean forMapTask = t.getTask().isMapTask();
  if (!forMapTask) {
    reduceJvmManager.reapJvm(t, env);
    return;
  }
  mapJvmManager.reapJvm(t, env);
}

11 mapJvmManager.reapJvm()

    /**
     * Finds or creates a JVM for the task of {@code t}.
     *
     * Below the JVM cap a new JVM is spawned straight away. At the cap, the
     * existing JVMs are scanned: an idle JVM of the same job that has not
     * yet run all its tasks is reserved for reuse (immediate return);
     * otherwise the last JVM seen that is either a same-job JVM that ran all
     * its tasks or an idle JVM of another job is killed and replaced by a
     * new one. Does nothing when the task was already killed.
     *
     * @param t   runner of the task that needs a JVM
     * @param env environment for a newly spawned JVM
     * @throws RuntimeException if the cap is reached and no JVM can be
     *         reused or killed
     */
    private synchronized void reapJvm(
        TaskRunner t, JvmEnv env) {
      if (t.getTaskInProgress().wasKilled()) {
        //the task was killed in-flight
        //no need to do the rest of the operations
        return;
      }
      boolean spawnNewJvm = false;
      JobID jobId = t.getTask().getJobID();
      //Check whether there is a free slot to start a new JVM.
      //,or, Kill a (idle) JVM and launch a new one
      //When this method is called, we *must*
      // (1) spawn a new JVM (if we are below the max)
      // (2) find an idle JVM (that belongs to the same job), or,
      // (3) kill an idle JVM (from a different job)
      // NOTE(review): the original comment says the outcomes are tried in the
      // order above; in the code, (2) and (3) are only considered when (1)
      // does not apply, and (2) wins over (3) whenever a reusable JVM exists.
      int numJvmsSpawned = jvmIdToRunner.size();
      JvmRunner runnerToKill = null;
      if (numJvmsSpawned >= maxJvms) {
        //go through the list of JVMs for all jobs.
        Iterator<Map.Entry<JVMId, JvmRunner>> jvmIter =
          jvmIdToRunner.entrySet().iterator();

        while (jvmIter.hasNext()) {
          JvmRunner jvmRunner = jvmIter.next().getValue();
          JobID jId = jvmRunner.jvmId.getJobId();
          //look for a free JVM for this job; if one exists, reserve it and
          //return
          if (jId.equals(jobId) && !jvmRunner.isBusy() && !jvmRunner.ranAll()){
            setRunningTaskForJvm(jvmRunner.jvmId, t); //reserve the JVM
            LOG.info("No new JVM spawned for jobId/taskid: " +
                     jobId+"/"+t.getTask().getTaskID() +
                     ". Attempting to reuse: " + jvmRunner.jvmId);
            return;
          }
          //Cases when a JVM is killed:
          // (1) the JVM under consideration belongs to the same job
          //     (passed in the argument). In this case, kill only when
          //     the JVM ran all the tasks it was scheduled to run (in terms
          //     of count).
          // (2) the JVM under consideration belongs to a different job and is
          //     currently not busy
          //But in both the above cases, we see if we can assign the current
          //task to an idle JVM (hence we continue the loop even on a match)
          //A later match overwrites an earlier one, so the last candidate
          //in iteration order is the one killed.
          if ((jId.equals(jobId) && jvmRunner.ranAll()) ||
              (!jId.equals(jobId) && !jvmRunner.isBusy())) {
            runnerToKill = jvmRunner;
            spawnNewJvm = true;
          }
        }
      } else {
        // below the cap: simply start another JVM
        spawnNewJvm = true;
      }

      if (spawnNewJvm) {
        if (runnerToKill != null) {
          LOG.info("Killing JVM: " + runnerToKill.jvmId);
          runnerToKill.kill();
        }
        spawnNewJvm(jobId, env, t);
        return;
      }
      //*MUST* never reach this
      //(reached only when the cap is hit and the scan found neither a
      //reusable JVM nor a kill candidate)
      throw new RuntimeException("Inconsistent state!!! " +
      		"JVM Manager reached an unstable state " +
            "while reaping a JVM for task: " + t.getTask().getTaskID()+
            " " + getDetails());
    }

12 spawnNewJvm(jobId, env, t);

    private void spawnNewJvm(JobID jobId, JvmEnv env,          TaskRunner t) {      JvmRunner jvmRunner = new JvmRunner(env,jobId);      jvmIdToRunner.put(jvmRunner.jvmId, jvmRunner);      //spawn the JVM in a new thread. Note that there will be very little      //extra overhead of launching the new thread for a new JVM since      //most of the cost is involved in launching the process. Moreover,      //since we are going to be using the JVM for running many tasks,      //the thread launch cost becomes trivial when amortized over all      //tasks. Doing it this way also keeps code simple.      jvmRunner.setDaemon(true);      jvmRunner.setName("JVM Runner " + jvmRunner.jvmId + " spawned.");      setRunningTaskForJvm(jvmRunner.jvmId, t);      LOG.info(jvmRunner.getName());      jvmRunner.start();    }

13 jvmRunner.run()

      public void run() {        runChild(env);      }      public void runChild(JvmEnv env) {        try {          env.vargs.add(Integer.toString(jvmId.getId()));          List<String> wrappedCommand =             TaskLog.captureOutAndError(env.setup, env.vargs, env.stdout, env.stderr,                env.logSize, env.pidFile);          shexec = new ShellCommandExecutor(wrappedCommand.toArray(new String[0]),               env.workDir, env.env);          shexec.execute();        } catch (IOException ioe) {          // do nothing          // error and output are appropriately redirected        } finally { // handle the exit code          if (shexec == null) {            return;          }          int exitCode = shexec.getExitCode();          updateOnJvmExit(jvmId, exitCode, killed);          LOG.info("JVM : " + jvmId +" exited. Number of tasks it ran: " +               numTasksRan);          try {            // In case of jvm-reuse,            //the task jvm cleans up the common workdir for every             //task at the beginning of each task in the task JVM.            //For the last task, we do it here.            if (env.conf.getNumTasksToExecutePerJvm() != 1) {              FileUtil.fullyDelete(env.workDir);            }          } catch (IOException ie){}        }      }

14 shexec.execute();

    /**
     * Executes the shell command by delegating to {@code run()}.
     *
     * @throws IOException propagated from {@code run()}
     */
    public void execute() throws IOException {
      run();
    }

  protected void run() throws IOException {    if (lastTime + interval > System.currentTimeMillis())      return;    exitCode = 0; // reset for next run    runCommand();  }

  private void runCommand() throws IOException {     ProcessBuilder builder = new ProcessBuilder(getExecString());    boolean completed = false;        if (environment != null) {      builder.environment().putAll(this.environment);    }    if (dir != null) {      builder.directory(this.dir);    }        process = builder.start();    final BufferedReader errReader =             new BufferedReader(new InputStreamReader(process                                                     .getErrorStream()));    BufferedReader inReader =             new BufferedReader(new InputStreamReader(process                                                     .getInputStream()));    final StringBuffer errMsg = new StringBuffer();        // read error and input streams as this would free up the buffers    // free the error stream buffer    Thread errThread = new Thread() {      @Override      public void run() {        try {          String line = errReader.readLine();          while((line != null) && !isInterrupted()) {            errMsg.append(line);            errMsg.append(System.getProperty("line.separator"));            line = errReader.readLine();          }        } catch(IOException ioe) {          LOG.warn("Error reading the error stream", ioe);        }      }    };    try {      errThread.start();    } catch (IllegalStateException ise) { }    try {      parseExecResult(inReader); // parse the output      // clear the input stream buffer      String line = inReader.readLine();      while(line != null) {         line = inReader.readLine();      }      // wait for the process to finish and check the exit code      exitCode = process.waitFor();      try {        // make sure that the error thread exits        errThread.join();      } catch (InterruptedException ie) {        LOG.warn("Interrupted while reading the error stream", ie);      }      completed = true;      if (exitCode != 0) {        throw new ExitCodeException(exitCode, errMsg.toString());      }    } catch 
(InterruptedException ie) {      throw new IOException(ie.toString());    } finally {      // close the input stream      try {        inReader.close();      } catch (IOException ioe) {        LOG.warn("Error while closing the input stream", ioe);      }      if (!completed) {        errThread.interrupt();      }      try {        errReader.close();      } catch (IOException ioe) {        LOG.warn("Error while closing the error stream", ioe);      }      process.destroy();      lastTime = System.currentTimeMillis();    }  }

0 0