hdfs的C接口libhdfs的测试

来源：互联网发布：网络格斗3d游戏编辑：程序博客网时间：2024/05/21 07:08

libhdfs is a JNI based C api for Hadoop’s DFS.
It provides C apis to a subset of the HDFS APIs to manipulate DFS files and the filesystem.
libhdfs is part of the hadoop distribution and comes pre-compiled in ${HADOOP_HOME}/libhdfs/libhdfs.so .

API简介
    建立、关闭与HDFS连接：hdfsConnect()、hdfsConnectAsUser()、hdfsDisconnect()。hdfsConnect()实际上是直接调用hdfsConnectAsUser。
    打开、关闭HDFS文件：hdfsOpenFile()、hdfsCloseFile()。当用hdfsOpenFile()创建文件时，可以指定replication和blocksize参数。写打开一个文件时，隐含O_TRUNC标志，文件会被截断，写入是从文件头开始的。
    读HDFS文件：hdfsRead()、hdfsPread()。两个函数都有可能返回少于用户要求的字节数，此时可以再次调用这两个函数读入剩下的部分（类似APUE中的readn实现）；只有在两个函数返回零时，我们才能断定到了文件末尾。
    写HDFS文件：hdfsWrite()。HDFS不支持随机写，只能是从文件头顺序写入。
    查询HDFS文件信息：hdfsGetPathInfo()
    查询和设置HDFS文件读写偏移量：hdfsSeek()、hdfsTell()
    查询数据块所在节点信息：hdfsGetHosts()。返回一个或多个数据块所在数据节点的信息，一个数据块可能存在多个数据节点上。
    libhdfs中的函数是通过jni调用JAVA虚拟机，在虚拟机中构造对应的HDFS的JAVA类，然后反射调用该类的功能函数。总会发生JVM和程序之间内存拷贝的动作，性能方面值得注意。
    HDFS不支持多个客户端同时写入的操作，无文件或是记录锁的概念。
    建议只有超大文件才应该考虑放在HDFS上，而且最好对文件的访问是写一次，读多次。小文件不应该考虑放在HDFS上，得不偿失！

测试1：

#include “hdfs.h”int main(int argc, char **argv) {    if (argc != 4) {        fprintf(stderr, “Usage: hdfs_read \n”);    exit(-1);    }    hdfsFS fs = hdfsConnect(“default”, 0);    if (!fs) {        fprintf(stderr, “Oops! Failed to connect to hdfs!\n”);        exit(-1);    }    const char* rfile = argv[1];    tSize fileTotalSize = strtoul(argv[2], NULL, 10);    tSize bufferSize = strtoul(argv[3], NULL, 10);    hdfsFile readFile = hdfsOpenFile(fs, rfile, O_RDONLY, bufferSize, 0, 0);    if (!readFile) {        fprintf(stderr, “Failed to open %s for writing!\n”, rfile);        exit(-2);    }    // data to be written to the file    char* buffer = malloc(sizeof(char) * bufferSize);    if(buffer == NULL) {        return -2;    }    // read from the file    tSize curSize = bufferSize;    for (; curSize == bufferSize;) {        curSize = hdfsRead(fs, readFile, (void*)buffer, curSize);    }    free(buffer);    hdfsCloseFile(fs, readFile);    hdfsDisconnect(fs);    return 0;}

这段代码是从hdfs或者本地读文件。
hdfsFS fs = hdfsConnect("default", 0); 这个是默认的连接方式，使用Hadoop配置文件中指定的默认文件系统；如果没有配置，则是本地文件系统，即从本地读数据
hdfsFS fs = hdfsConnect("192.168.0.128", 9000); 这个是连接指定的hdfs

编译命令如下：
gcc hdfs_read.c -I${HADOOP_HOME}/src/c++/libhdfs -I${JAVA_HOME}/include -I${JAVA_HOME}/include/linux -L${HADOOP_HOME}/libhdfs -lhdfs -o hdfs_read

[hadoop@hadoop00 sunwg]$ ./hdfs_read /sunwg/1.txt 4 10
tom
mary

测试2：

#include “hdfs.h”int main(int argc, char **argv) {    if (argc != 4) {        fprintf(stderr, “Usage: hdfs_write \n”);        exit(-1);    }    hdfsFS fs = hdfsConnect(“default”, 0);    if (!fs) {        fprintf(stderr, “Oops! Failed to connect to hdfs!\n”);        exit(-1);    }    const char* writeFileName = argv[1];    tSize fileTotalSize = strtoul(argv[2], NULL, 10);    tSize bufferSize = strtoul(argv[3], NULL, 10);    hdfsFile writeFile = hdfsOpenFile(fs, writeFileName, O_WRONLY, bufferSize, 0, 0);    if (!writeFile) {        fprintf(stderr, “Failed to open %s for writing!\n”, writeFileName);        exit(-2);    }    // data to be written to the file    char* buffer = malloc(sizeof(char) * bufferSize);    if(buffer == NULL) {        return -2;    }    int i = 0;    for (i=0; i < bufferSize; ++i) {        buffer[i] = 'a' + (i%26);    }    // write to the file    tSize nrRemaining;    for (nrRemaining = fileTotalSize; nrRemaining > 0; nrRemaining -= bufferSize ) {        int curSize = ( bufferSize < nrRemaining ) ? bufferSize : (int)nrRemaining;        hdfsWrite(fs, writeFile, (void*)buffer, curSize);    }    free(buffer);    hdfsCloseFile(fs, writeFile);    hdfsDisconnect(fs);    return 0;}

这段代码是向hdfs或者本地写文件。
hdfsFS fs = hdfsConnect("default", 0); 这个是默认的连接方式，使用Hadoop配置文件中指定的默认文件系统；如果没有配置，则是本地文件系统，即向本地写数据
hdfsFS fs = hdfsConnect("192.168.0.128", 9000); 这个是连接指定的hdfs

gcc hdfs_write.c -I${HADOOP_HOME}/src/c++/libhdfs -I${JAVA_HOME}/include -I${JAVA_HOME}/include/linux -L${HADOOP_HOME}/libhdfs -lhdfs -o hdfs_write

所有函数的详细介绍可以看hdfs.h这个头文件。
注意：在执行过程中可能会报一些java包的错误，不要忘记把下面两个目录下的jar包加到classpath中。
/home/hadoop/hadoop/*.jar
/home/hadoop/hadoop/lib/*.jar