Refactoring the hdfs sync code


This version still has a performance problem: when the sizes match, the files are compared by streaming them and computing an MD5 digest, which is expensive. If the same file is synced over and over, every run stays slow, so I refactored it again in a newer version; see "sync 再次重构" (sync refactored again).
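To make that cost concrete, here is a minimal sketch (not taken from the code below; the names are illustrative) of the size-then-MD5 comparison: the length check is cheap and short-circuits when the sizes differ, but whenever the sizes match, both the local file and the HDFS file are read end to end just to compare digests.

import java.io.{File, FileInputStream}
import org.apache.commons.codec.digest.DigestUtils
import org.apache.hadoop.fs.{FileSystem, Path}

object SizeThenMd5Sketch {
  // Cheap length check first; the expensive streaming MD5 runs only when the lengths
  // already match, which is exactly the case hit when an unchanged file is synced repeatedly.
  def sameBySizeThenMd5(local: File, remote: Path, fs: FileSystem): Boolean = {
    val status = fs.getFileStatus(remote)
    local.length() == status.getLen && {
      val localIn = new FileInputStream(local)
      val hdfsIn = fs.open(remote)
      try DigestUtils.md5Hex(localIn) == DigestUtils.md5Hex(hdfsIn) // reads both files end to end
      finally { localIn.close(); hdfsIn.close() }
    }
  }
}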

The original version is in the earlier post "hdfs file md5 计算,实现本地与hdfs同步文件" (computing an HDFS file's MD5 to sync local files with HDFS).

Without further ado, the code is below.

The main fixes are the following:
1. slf4j's {} placeholders (a short sketch follows this list);
2. the rewrite of the needUpdate method;
3. exception handling;
4. sameFile: compare the size first and only compute the MD5 when the sizes match, which keeps the resource cost low.
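On the slf4j point, a minimal sketch (not part of the original post; the file name is just an example) of what the {} placeholder buys: slf4j substitutes the argument into the template itself, and only when the log level is enabled, so no Scala string interpolation is needed.

import org.slf4j.LoggerFactory

object LoggingSketch {
  private val logger = LoggerFactory.getLogger(getClass)

  def demo(fileName: String): Unit = {
    // slf4j fills the {} placeholder with fileName; the message is only built
    // if the INFO level is enabled, so there is no need for an s"..." interpolator here.
    logger.info("the file: {} in local and hdfs are the same one", fileName)
  }
}

The code below still keeps an s prefix in front of some of these templates; it is harmless because those templates contain no $ substitutions, but it is not needed.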

import java.io.{File, FileInputStream, IOException}
import org.apache.commons.codec.digest.DigestUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.slf4j.LoggerFactory

/**
  * Created by todd.chen on 16/3/15.
  * email : todd.chen@ximalaya.com
  */
object PathSyncer {

  lazy val logger = LoggerFactory.getLogger(this.getClass)

  // (local path is a file, hdfs path is a file, both are the same file)
  type IsSameFile = (Boolean, Boolean, Boolean)

  @throws(classOf[IOException])
  def sync(localFile: File, hdfsPath: Path, configuration: Configuration): Unit = {
    implicit val fileSystem = FileSystem.get(configuration)
    sync(localFile, hdfsPath)
  }

  @throws(classOf[IOException])
  def sync(hdfsPath: Path, localFile: File)(implicit configuration: Configuration): Unit = {
    implicit val fileSystem = FileSystem.get(configuration)
    sync(localFile, hdfsPath)
  }

  private def sync(localFile: File, hdfsPath: Path)(implicit fileSystem: FileSystem): Unit = {
    val fileCheck: IsSameFile = (localFile.isFile, fileSystem.isFile(hdfsPath),
      sameFile(localFile, hdfsPath))
    fileCheck match {
      case (true, true, true) ⇒
        logger.info(s"the file : {} in local and hdfs are same one", localFile.getName)
      case (true, true, false) ⇒
        logger.debug(s"the file: {} in local and hdfs have same name, but they are different files",
          localFile.getName)
        fileSystem.copyFromLocalFile(false, true, new Path(localFile.toURI), hdfsPath)
      case (true, false, _) ⇒
        logger.debug(s"the file: {} in local is a file and in hdfs is a dir", localFile.getName)
        throw new IllegalArgumentException(s"${localFile.getName} in local is file and in hdfs is dir")
      case (false, true, _) ⇒
        logger.debug(s"in local {} is a dir and in hdfs is a file", localFile.getName)
        throw new IllegalArgumentException(s"${localFile.getName} in local is dir and in hdfs is file")
      case (false, false, _) ⇒
        logger.debug(s"both in local and hdfs this is a dir: {}", localFile.getName)
        // three lists: which files need upload, which need delete, which need update
        syncChildren(localFile, hdfsPath)
        val childrenDir = localFile.listFiles().filter(_.isDirectory)
        childrenDir.foreach(file ⇒ sync(file, new Path(hdfsPath, file.getName)))
    }
  }

  private def syncChildren(localFile: File, hdfsPath: Path)(implicit fileSystem: FileSystem) = {
    val deleteList = needDelete(localFile, hdfsPath)
    val uploadList = needUpload(localFile, hdfsPath)
    val updateList = needUpdate(localFile, hdfsPath)
    val localParentMappingHdfs = new Path(localFile.toURI)
    logger.debug("deleting which file need delete")
    deleteList.foreach(name ⇒ fileSystem.delete(new Path(hdfsPath, name), true))
    logger.debug("deleted and uploading which file need upload or update")
    (updateList ++ uploadList).foreach(child ⇒ fileSystem.copyFromLocalFile(false, true,
      new Path(localParentMappingHdfs, child), new Path(hdfsPath, child)))
    logger.debug("uploaded")
  }

  // present in hdfs but no longer present locally
  private def needDelete(localFile: File, hdfsPath: Path)(implicit fileSystem: FileSystem) = {
    fileSystem.listStatus(hdfsPath)
      .map(_.getPath.getName).diff(localFile.listFiles().map(_.getName)).toList
  }

  // present locally but missing in hdfs
  private def needUpload(localFile: File, hdfsPath: Path)(implicit fileSystem: FileSystem) = {
    localFile.listFiles().filter(_.isFile).map(_.getName).diff(
      fileSystem.listStatus(hdfsPath).filter(_.isFile).map(_.getPath.getName)).toList
  }

  // present on both sides but with different content
  private def needUpdate(localFile: File, hdfsPath: Path)(implicit fileSystem: FileSystem) = {
    val intersectNameList =
      fileSystem.listStatus(hdfsPath).filter(_.isFile).map(_.getPath.getName)
        .intersect(localFile.listFiles().filter(_.isFile).map(_.getName))
    intersectNameList.filter(name ⇒ !sameFile(new File(localFile, name), new Path(hdfsPath, name)))
  }

  // compare the cheap size first; the MD5 digests are only computed when the sizes match
  private def sameFile(file: File, path: Path)(implicit fileSystem: FileSystem): Boolean = {
    val dfsStatus = fileSystem.getFileStatus(path)
    file.isFile && dfsStatus.isFile &&
      (file.length() == dfsStatus.getLen) &&
      (getHdfsFileMd5(dfsStatus.getPath) == getLocalFileMd5(file))
  }

  @throws(classOf[IOException])
  private[sync] def getHdfsFileMd5(path: Path)(implicit dfs: FileSystem): String = {
    val in = dfs.open(path)
    try {
      DigestUtils.md5Hex(in)
    } catch {
      case e: IOException ⇒ throw e
    } finally {
      try {
        in.close()
      } catch {
        case e: IOException ⇒ throw e
      }
    }
  }

  @throws(classOf[IOException])
  private[sync] def getLocalFileMd5(file: File): String = {
    val in = new FileInputStream(file)
    try {
      DigestUtils.md5Hex(in)
    } catch {
      case e: IOException ⇒ throw e
    } finally {
      try {
        in.close()
      } catch {
        case e: IOException ⇒ throw e
      }
    }
  }
}
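For completeness, a minimal usage sketch, assuming PathSyncer is in scope; the paths are hypothetical, and the Hadoop Configuration is expected to pick up core-site.xml / hdfs-site.xml from the classpath so it points at the right namenode.

import java.io.File
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path

object SyncMain {
  def main(args: Array[String]): Unit = {
    val conf = new Configuration()                // loads *-site.xml from the classpath
    val localDir = new File("/data/export/daily") // hypothetical local directory
    val hdfsDir = new Path("/user/todd/daily")    // hypothetical HDFS directory

    // Public entry point: walks both trees, deletes what is gone locally,
    // and uploads whatever is new or has changed.
    PathSyncer.sync(localDir, hdfsDir, conf)
  }
}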