hdfs sync的同步代码重构
来源:互联网 发布:钟秀勇2017知产讲义pdf 编辑:程序博客网 时间:2024/06/09 12:57
这个版本还是有性能问题:由于 size 相同时会以流的形式比较文件的 md5,这个代价太高;如果重复同步同一个文件,每次耗时都会很高。所以重构了一个新的版本,见《hdfs sync 再次重构》一文。
原文见《hdfs file md5 计算,实现本地与hdfs同步文件》一文。
啥话都不说,直接上代码:
主要是几个问题:
第一个是slf4j的{},第二个是needUpdate方法的重写,第三个是异常的处理,第四个是sameFile,先比较size,再比较md5,这样资源消耗小
import java.io.{File, FileInputStream, IOException}

import org.apache.commons.codec.digest.DigestUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.slf4j.LoggerFactory

/**
 * Recursively synchronizes a local file/directory tree onto an HDFS path.
 *
 * Files are compared cheaply first (both plain files, then size) and only
 * then by MD5, so re-syncing an unchanged file costs a metadata lookup
 * instead of a full stream read.
 *
 * Created by todd.chen on 16/3/15.
 * email : todd.chen@ximalaya.com
 */
object PathSyncer {

  lazy val logger = LoggerFactory.getLogger(this.getClass)

  /** (local is a plain file, hdfs is a plain file, contents are identical). */
  type IsSameFile = (Boolean, Boolean, Boolean)

  /** Sync `localFile` onto `hdfsPath` using the given Hadoop configuration. */
  @throws(classOf[IOException])
  def sync(localFile: File, hdfsPath: Path, configuration: Configuration): Unit = {
    implicit val fileSystem = FileSystem.get(configuration)
    sync(localFile, hdfsPath)
  }

  /** Overload taking the configuration implicitly; argument order is (hdfs, local). */
  @throws(classOf[IOException])
  def sync(hdfsPath: Path, localFile: File)(implicit configuration: Configuration): Unit = {
    implicit val fileSystem = FileSystem.get(configuration)
    sync(localFile, hdfsPath)
  }

  /**
   * Core recursive sync of one local path onto one HDFS path.
   *
   * BUGFIX: the original built an eager `(isFile, isFile, sameFile)` tuple, so
   * `sameFile` -> `getFileStatus` ran even when the destination did not exist
   * and aborted with FileNotFoundException; a first-time sync was impossible.
   * Now a missing destination is created/uploaded, and `sameFile` is only
   * evaluated when both sides are plain files.
   */
  private def sync(localFile: File, hdfsPath: Path)(implicit fileSystem: FileSystem): Unit = {
    if (!fileSystem.exists(hdfsPath)) {
      if (localFile.isFile) {
        logger.debug("uploading new file {}", localFile.getName)
        fileSystem.copyFromLocalFile(false, true, new Path(localFile.toURI), hdfsPath)
      } else {
        logger.debug("creating new dir {}", localFile.getName)
        fileSystem.mkdirs(hdfsPath)
        syncChildren(localFile, hdfsPath)
        listLocal(localFile).filter(_.isDirectory)
          .foreach(dir => sync(dir, new Path(hdfsPath, dir.getName)))
      }
    } else (localFile.isFile, fileSystem.isFile(hdfsPath)) match {
      case (true, true) =>
        // Both are plain files: now (and only now) is the content comparison safe.
        if (sameFile(localFile, hdfsPath)) {
          // NOTE: plain string + {} placeholder — the original's s"..." prefix
          // was dead weight next to SLF4J parameterized logging.
          logger.info("the file : {} in local and hdfs are same one", localFile.getName)
        } else {
          logger.debug("the file: {} in local and hdfs have same name,but they are different file", localFile.getName)
          fileSystem.copyFromLocalFile(false, true, new Path(localFile.toURI), hdfsPath)
        }
      case (true, false) =>
        logger.debug("the file: {} in local is file and in hdfs is dir", localFile.getName)
        throw new IllegalArgumentException(s"${localFile.getName} in local is file and in hdfs is dir")
      case (false, true) =>
        logger.debug("in local {} is a dir and in hdfs is a file", localFile.getName)
        throw new IllegalArgumentException(s"${localFile.getName} in local is dir and in hdfs is file")
      case (false, false) =>
        logger.debug("both local and hdfs this is dir:{}", localFile.getName)
        // Sync the direct children first, then recurse into sub-directories.
        syncChildren(localFile, hdfsPath)
        listLocal(localFile).filter(_.isDirectory)
          .foreach(dir => sync(dir, new Path(hdfsPath, dir.getName)))
    }
  }

  /** Null-safe wrapper: `File.listFiles` returns null on I/O error or non-directory. */
  private def listLocal(dir: File): Array[File] =
    Option(dir.listFiles()).getOrElse(Array.empty[File])

  /** Brings the direct children of `hdfsPath` in line with those of `localFile`. */
  private def syncChildren(localFile: File, hdfsPath: Path)(implicit fileSystem: FileSystem): Unit = {
    val deleteList = needDelete(localFile, hdfsPath)
    val uploadList = needUpload(localFile, hdfsPath)
    val updateList = needUpdate(localFile, hdfsPath)
    // Local parent expressed as a Path so child sources can be built with new Path(parent, name).
    val localParentMappingHdfs = new Path(localFile.toURI)
    logger.debug("deleting which file need delete")
    deleteList.foreach(name => fileSystem.delete(new Path(hdfsPath, name), true))
    logger.debug("deleted and uploading which file need upload or update")
    (updateList ++ uploadList).foreach(child =>
      fileSystem.copyFromLocalFile(false, true,
        new Path(localParentMappingHdfs, child), new Path(hdfsPath, child)))
    logger.debug("uploaded")
  }

  /** Names present on HDFS (files or dirs) that no longer exist locally. */
  private def needDelete(localFile: File, hdfsPath: Path)(implicit fileSystem: FileSystem) =
    fileSystem.listStatus(hdfsPath).map(_.getPath.getName)
      .diff(listLocal(localFile).map(_.getName)).toList

  /** Local plain files that have no same-named plain file on HDFS yet. */
  private def needUpload(localFile: File, hdfsPath: Path)(implicit fileSystem: FileSystem) =
    listLocal(localFile).filter(_.isFile).map(_.getName)
      .diff(fileSystem.listStatus(hdfsPath).filter(_.isFile).map(_.getPath.getName)).toList

  /** Files present on both sides whose contents differ (size checked before MD5). */
  private def needUpdate(localFile: File, hdfsPath: Path)(implicit fileSystem: FileSystem) = {
    val intersectNameList = fileSystem.listStatus(hdfsPath).filter(_.isFile).map(_.getPath.getName)
      .intersect(listLocal(localFile).filter(_.isFile).map(_.getName))
    intersectNameList.filter(name => !sameFile(new File(localFile, name), new Path(hdfsPath, name)))
  }

  /**
   * Cheap-first equality: both must be plain files and sizes must match before
   * the expensive streaming MD5 comparison is attempted (&& short-circuits).
   */
  private def sameFile(file: File, path: Path)(implicit fileSystem: FileSystem): Boolean = {
    val dfsStatus = fileSystem.getFileStatus(path)
    file.isFile && dfsStatus.isFile &&
      file.length() == dfsStatus.getLen &&
      getHdfsFileMd5(dfsStatus.getPath) == getLocalFileMd5(file)
  }

  /**
   * MD5 hex digest of an HDFS file. Any IOException simply propagates; the
   * original's `catch { case e: IOException => throw e }` wrappers (including
   * one around close()) were no-ops and have been removed.
   */
  @throws(classOf[IOException])
  private[sync] def getHdfsFileMd5(path: Path)(implicit dfs: FileSystem): String = {
    val in = dfs.open(path)
    try DigestUtils.md5Hex(in) finally in.close()
  }

  /** MD5 hex digest of a local file; the stream is always closed. */
  @throws(classOf[IOException])
  private[sync] def getLocalFileMd5(file: File): String = {
    val in = new FileInputStream(file)
    try DigestUtils.md5Hex(in) finally in.close()
  }
}
1 0
- hdfs sync的同步代码重构
- hdfs sync 再次重构
- 在vs code中使用ftp-sync插件实现客户端与服务器端代码的同步
- 在vs code中使用ftp-sync插件实现客户端与服务器端代码的同步
- firefox sync 同步失败问题的解决方法
- go sync的并发同步简单用法
- 关于sync同步块的个人理解
- 同步代码 重启步骤
- HBase, HDFS and durable sync
- Firefox Sync账户同步
- sync同步命令
- sync 数据同步
- sync同步指令
- 代码重构的时机
- 重构自己的代码
- 代码重构的介绍
- 软件代码的重构
- 代码重构的原则
- 移动端H5页面高清多屏适配方案
- 成为JavaGC专家(2)
- Android Java与C++的调用
- oracle中如何更改order by的默认排序?
- VBA学习笔记(1)
- hdfs sync的同步代码重构
- Android开发_控制硬加速hardwareAccelerated
- Oracle表空间、段、区和块简述
- 【小镇的技术天梯】Fast-CGI和php-fpm之间的关系
- 磁盘及文件系统管理(分区,挂载,卸载,信息展示与统计)
- 哈希冲突之开链法
- Spark机器学习读书笔记
- copy和strong
- 成为Java GC专家(3)