拉取hadoop集群上的hdfs文件
来源:互联网 发布:张起灵 知乎 编辑:程序博客网 时间:2024/06/03 22:39
从hadoop集群拉取hdfs文件是一个常见的需求,基于org.apache.hadoop即可做到。
但是hadoop包有个明显的缺点是传递依赖太多,经常需要排除冲突的依赖,包括但不限于httpclient,servlet,slf4j,tomcat等等。
@Servicepublic class HdfsClient{ private static final Logger logger = LoggerFactory.getLogger(HdfsClient.class); private FileSystem fileSystem; private Configuration conf; public synchronized void init() throws Exception { String proxy = "x.x.x.x:x"; String username = "xxx"; boolean useProxy = false; conf = new Configuration(); conf.set("fs.defaultFS", "hdfs://argo"); conf.set("dfs.web.ugi", "hdfs,hadoop"); conf.set("dfs.nameservices", "argo"); conf.set("dfs.ha.namenodes.argo", "nn1,nn2"); conf.set("dfs.namenode.rpc-address.argo.nn1", "xxx:x"); conf.set("dfs.namenode.rpc-address.argo.nn2", "xxx:x"); conf.set("dfs.client.failover.proxy.provider.argo", "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider"); System.setProperty("HADOOP_USER_NAME", username); if (useProxy) { conf.set("hadoop.socks.server", proxy); conf.set("hadoop.rpc.socket.factory.class.default", "org.apache.hadoop.net.SocksSocketFactory"); conf.set("dfs.client.use.legacy.blockreader", "true"); } this.fileSystem = FileSystem.get(conf); logger.info("init hdfs client success,proxy=" + proxy + ",username=" + username + ",useProxy=" + useProxy); } /** * 拉取无分区数据 * @param remotePath * @param localPath */ public void pullHdfs(String remotePath, String localPath) throws Exception{ if (!remotePath.endsWith(File.separator)) { remotePath = remotePath + File.separator; } if (!localPath.endsWith(File.separator)) { localPath = localPath + File.separator; } StopWatch time = new StopWatch(); time.start(); File file = new File(localPath); deleteFile(file); pullData(remotePath,localPath); logger.info("pull {} to {} success! 
size={} time={}", remotePath, localPath, getDirSize(file), time.getTime()); time.stop(); } /** * 计算文件夹大小 * @param file * @return */ private long getDirSize(final File file) { if (file.isFile()) return file.length(); final File[] children = file.listFiles(); long total = 0; if (children != null) for (final File child : children) total += getDirSize(child); return total; } /** * 删除老数据整个文件路径 * * @param file */ private void deleteFile(File file) { if (file.exists()) { if (file.isFile()) { file.delete(); } else if (file.isDirectory()) { File files[] = file.listFiles(); for (int i = 0; i < files.length; i++) { this.deleteFile(files[i]); } } file.delete(); } } /** * 拉取远程数据到本地 * @param remotePath * @param localPath * @throws Exception */ private void pullData(String remotePath, String localPath) throws Exception { int tryNum = 1; if (StringUtils.isBlank(remotePath) || StringUtils.isBlank(localPath)) { logger.error("Invalid Path!"); throw new Exception("Invalid Path!"); } do { try { if (hdfsExist(remotePath)) { hdfsPull(remotePath, localPath); File file = new File(localPath); deleteCrcChecksum(file); } } catch (Exception e) { logger.error("error@checkData,remotePath=" + remotePath + ",localPath=" + localPath + ",tryNum=" + tryNum + ",ex={}", e); tryNum++; } } while (tryNum >1 && tryNum <4); if(tryNum == 4) { throw new Exception("fail to get " + remotePath + " after 3 times try"); } } /** * 删除crc和success文件 * @param file */ private void deleteCrcChecksum(File file) { if (file.exists()) { if (file.isFile()) { if(file.getName().toLowerCase().endsWith(".crc") || file.getName().toLowerCase().endsWith("_success")) { file.delete(); } } else if (file.isDirectory()) { File files[] = file.listFiles(); for (int i = 0; i < files.length; i++) { this.deleteCrcChecksum(files[i]); } } } } /** * 判断远程文件是否存在 * @param dfsPath * @return * @throws IOException */ private boolean hdfsExist(final String dfsPath) throws IOException { return fileSystem.exists(new Path(dfsPath)); } /** * 拉取远程文件 * 
@param dfsPath * @param localPath * @throws IOException */ private void hdfsPull(final String dfsPath, final String localPath) throws IOException { try { fileSystem.copyToLocalFile(new Path(dfsPath), new Path(localPath)); } catch (Exception e) { logger.error("Exception@HdfsClient, dfsPath=" + dfsPath + ", localPath=" + localPath, e); } }}
0 0
- 拉取hadoop集群上的hdfs文件
- 初见Hadoop—- 搭建MyEclipse 访问HDFS 上的文件
- java hadoop hdfs 上写文件
- Hadoop集群之HDFS
- HDFS--hadoop集群
- hadoop的HDFS文件存储
- Hadoop HDFS Explorer连接Windows上的HDFS
- 读取hdfs上的文件
- hadoop的hdfs文件操作实现上传文件到hdfs
- hadoop集群hdfs磁盘划分
- 配置spark+hadoop(hdfs)集群
- Hadoop的hdfs常用的文件命令
- [Hadoop培训笔记]02-HDFS集群的安装与部署
- 为已存在的Hadoop集群配置HDFS Federation
- Hadoop的HDFS文件存储实现机制
- hadoop的hdfs文件常用命令操作
- Hadoop HDFS文件操作的Java代码
- Hadoop HDFS文件操作的Java代码
- 机器学习基本概念(三)
- 智能手机双摄像头工作原理详解:RBG +RGB, RGB + Mono
- 53. Maximum Subarray
- InnoDB存储引擎(一)
- 蓝桥杯 2015 决赛 3 显示二叉树
- 拉取hadoop集群上的hdfs文件
- 九、用图讲解MapReduce Shuffle 过程
- 二分法排序
- 金融技术,也算技术哈?
- static的作用
- BP神经网络(完整的理论和经验公式)
- 解决this.getHibernateTemplate()的空指针异常的问题
- UCOS-II移植 os_cpu.h文件详解
- c#web窗体登录界面登录注册以及密码找回发送邮箱功能