KFS Summary


1. DiskIO in KFS

In the chunkserver, every chunk is represented by a ChunkInfoHandle. Its member related to file I/O is a DiskIo::FilePtr, and the type that pointer refers to is:

class File
{
public:
    File()
        : mQueuePtr(0),
          mFileIdx(-1),
          mReadOnlyFlag(false),
          mSpaceReservedFlag(false)
        {}
    ~File()
    {
        if (File::IsOpen()) {
            File::Close();
        }
    }

    ...

This class is an inner class defined by DiskIo. Its member mQueuePtr points to a DiskQueue object, the main class responsible for implementing asynchronous disk I/O.

class DiskQueue : public QCDiskQueue
{
public:
    typedef QCDLList<DiskQueue, 0> DiskQueueList;

    ...

private:
    std::string         mFileNamePrefixes;
    const unsigned long mDeviceId;
    DiskQueue*          mPrevPtr[1];
    DiskQueue*          mNextPtr[1];
    ...

    friend class QCDLListOp<DiskQueue, 0>;
};

The system can have multiple DiskQueue objects. Each one handles I/O for chunk files whose names start with a particular prefix, so in effect the files are partitioned into groups, with one queue responsible for each group's I/O.
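That prefix-based routing can be pictured with a small, self-contained sketch. PrefixQueue and PickQueue are illustrative names only; the real lookup walks the DiskQueueList held by DiskIoQueues.

// Minimal sketch of prefix-based routing; not KFS code.
#include <iostream>
#include <string>
#include <vector>

struct PrefixQueue {
    std::string fileNamePrefix;   // counterpart of DiskQueue::mFileNamePrefixes
    int         id;               // stands in for the queue itself
};

// Return the queue whose prefix matches the chunk file name, or 0 if none does.
const PrefixQueue* PickQueue(const std::vector<PrefixQueue>& queues,
                             const std::string& chunkFileName)
{
    for (size_t i = 0; i < queues.size(); i++) {
        const std::string& prefix = queues[i].fileNamePrefix;
        if (chunkFileName.compare(0, prefix.length(), prefix) == 0) {
            return &queues[i];
        }
    }
    return 0;   // no match: the caller would fall back to a default queue
}

int main()
{
    std::vector<PrefixQueue> queues;
    PrefixQueue q1 = { "/mnt/disk1/", 1 };
    PrefixQueue q2 = { "/mnt/disk2/", 2 };
    queues.push_back(q1);
    queues.push_back(q2);
    const PrefixQueue* q = PickQueue(queues, "/mnt/disk2/chunks/1.42.7");
    std::cout << (q ? q->id : -1) << std::endl;   // prints 2
    return 0;
}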


All of the system's DiskQueue objects are kept in the DiskQueueList of the DiskIoQueues object pointed to by the global pointer sDiskIoQueuesPtr. That global pointer is initialized when the chunkserver starts up and initializes the ChunkManager: ChunkManager::Init calls the static function DiskIo::Init.

class DiskIoQueues : private ITimeout
{
private:
    typedef DiskQueue::DiskQueueList DiskQueueList;

public:
    enum { kDiskQueueIdNone = -1 };
    typedef QCDLList<DiskIo, 0> DoneQueue;

    ...

    DiskIo*    mIoQueuesPtr[1];     // head of the list of completed DiskIo objects
    DiskQueue* mDiskQueuesPtr[1];   // head of the DiskQueue list


Whenever a KfsOp needs to read or write a chunk, the operation is represented by a DiskIo:

class DiskIo : private QCDiskQueue::IoCompletion
{
private:
    typedef std::vector<IOBufferData> IoBuffers;
    /// Owning KfsCallbackObj.
    KfsCallbackObj* const  mCallbackObjPtr;
    FilePtr                mFilePtr;
    QCDiskQueue::RequestId mRequestId;
    IoBuffers              mIoBuffers;
    size_t                 mReadBufOffset;
    size_t                 mReadLength;
    ssize_t                mIoRetCode;
    QCDiskQueue::RequestId mCompletionRequestId;
    QCDiskQueue::Error     mCompletionCode;
    DiskIo*                mPrevPtr[1];
    DiskIo*                mNextPtr[1];

    void RunCompletion();
    void IoCompletion(
        IOBuffer* inBufferPtr,
        int       inRetCode,
        bool      inSyncFlag = false);
    virtual bool Done(
        QCDiskQueue::RequestId      inRequestId,
        QCDiskQueue::FileIdx        inFileIdx,
        QCDiskQueue::BlockIdx       inStartBlockIdx,
        QCDiskQueue::InputIterator& inBufferItr,
        int                         inBufferCount,
        QCDiskQueue::Error          inCompletionCode,
        int                         inSysErrorCode,
        int64_t                     inIoByteCount);

    friend class QCDLListOp<DiskIo, 0>;
    friend class DiskIoQueues;
Overall, DiskIoQueues is the manager of all disk I/O, while the concrete disk operations are carried out by the DiskQueue reached through FilePtr->mQueuePtr. DiskQueue is only a thin wrapper around the disk I/O library class qcdio/QCDiskQueue: every request is added to the QCDiskQueue request queue and handled by dedicated disk I/O threads. When an operation finishes, the queue invokes the object passed in with the request, which implements the Done() completion function defined by the IoCompletion interface. One point worth emphasizing: this Done() runs in the disk I/O thread, not in the network I/O thread, so the KfsOp's callback cannot be invoked directly. Instead the per-request DiskIo's Done() is called; it does some processing based on the disk I/O thread's return values and then appends itself to the DoneQueue of the DiskIoQueues object pointed to by the global sDiskIoQueuesPtr.

DiskIoQueues in turn implements the ITimeout interface. Its Timeout() function pops DiskIo objects off the DoneQueue and runs their RunCompletion(); DiskIo::RunCompletion() invokes HandleEvent() on the associated KfsOp, which takes us back into the KfsOp's HandleEvent journey. Because Timeout() runs in the network I/O thread, this neatly guarantees that all KfsOp processing happens in the network I/O thread and never on any other thread, so no lock-based synchronization is needed.
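The thread handoff described above can be condensed into a short, self-contained sketch. None of the names below come from KFS; only the pattern does: the disk thread appends the finished request to a mutex-protected "done" list, and the network thread's timeout handler drains the list and runs the completion, so the callback logic never leaves the network I/O thread.

// Illustrative sketch of the completion handoff; not KFS code.
#include <deque>
#include <pthread.h>

struct FinishedIo {
    void (*handleEvent)(void* cbObj, int code);  // stands in for KfsCallbackObj
    void* cbObj;
    int   ioRetCode;
};

class DoneList {
public:
    DoneList() { pthread_mutex_init(&mMutex, 0); }
    // Called from the disk I/O thread (the analogue of DiskIo::Done()).
    void Enqueue(const FinishedIo& io) {
        pthread_mutex_lock(&mMutex);
        mDone.push_back(io);
        pthread_mutex_unlock(&mMutex);
    }
    // Called from the network thread (the analogue of DiskIoQueues::Timeout()).
    void RunCompletions() {
        std::deque<FinishedIo> done;
        pthread_mutex_lock(&mMutex);
        done.swap(mDone);                 // take the whole batch under the lock
        pthread_mutex_unlock(&mMutex);
        for (size_t i = 0; i < done.size(); i++) {
            // The analogue of DiskIo::RunCompletion() -> KfsOp::HandleEvent().
            done[i].handleEvent(done[i].cbObj, done[i].ioRetCode);
        }
    }
private:
    pthread_mutex_t        mMutex;
    std::deque<FinishedIo> mDone;
};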


2. The KFS client

Client code interacts mainly with two classes, KfsClientFactory and KfsClient. KfsClientFactory is a singleton that manages all KfsClient instances.

typedef boost::shared_ptr<KfsClient> KfsClientPtr;

class KfsClientFactory {
    // Make the constructor private to get a Singleton.
    KfsClientFactory();
    ~KfsClientFactory();

    KfsClientFactory(const KfsClientFactory &other);
    const KfsClientFactory & operator=(const KfsClientFactory &other);
    KfsClientPtr mDefaultClient;
    std::vector<KfsClientPtr> mClients;
    static KfsClientFactory* sForGdbToFindInstance;
public:
    static KfsClientFactory *Instance();
    ...
};
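Typical application usage looks roughly like the following. Instance() appears in the declaration above; the GetClient(host, port) overload, the Mkdirs() call, and the header path are assumed to sit behind the elided parts of the public interface, so treat this as a sketch rather than the exact API.

// Sketch of client creation through the singleton factory.
#include "libkfsClient/KfsClient.h"   // header path in the KFS source tree (assumption)

void CreateClientExample()
{
    KFS::KfsClientFactory* factory = KFS::KfsClientFactory::Instance();
    KFS::KfsClientPtr client = factory->GetClient("metaserver.example.com", 20000);
    if (client) {
        client->Mkdirs("/tmp/demo");  // every call is forwarded to KfsClientImpl
    }
}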

Each KfsClient connects to a different KFS server. KfsClient follows the bridge pattern: all of the actual work is delegated to KfsClientImpl *mImpl. KfsClient buffers file reads and writes; the buffered data is represented by a ChunkBuffer:

struct ChunkBuffer {
    // set the client buffer to be fairly big...for sequential reads,
    // we will hit the network few times and on each occasion, we read
    // a ton and thereby get decent performance; having a big buffer
    // obviates the need to do read-ahead :-)
    ChunkBuffer() : chunkno(-1), start(0), length(0), dirty(false), buf(NULL), bufsz(0) { }
    ~ChunkBuffer() { delete [] buf; }
    void invalidate() {
        chunkno = -1; start = 0; length = 0; dirty = false;
        delete [] buf;
        buf = 0;
    }
    void allocate() {
        if (! buf && bufsz > 0) {
            // XXX: align this to 16-byte boundary
            // see IOBuffer.cc code
            buf = new char[bufsz];
        }
    }
    int chunkno;   // which chunk
    off_t start;   // offset within the chunk
    size_t length; // length of valid data
    bool dirty;    // must flush to server if true
    char *buf;     // the data
    size_t bufsz;
};
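A read that falls entirely inside the buffered range of the current chunk can be served without going back to the chunkserver. A minimal sketch of that containment test follows; the helper name is hypothetical, and the real check lives in KfsClientImpl's read path.

// Sketch: can a read of numBytes at chunkOffset within chunk chunkNum be
// satisfied from the ChunkBuffer defined above?
#include <sys/types.h>

bool CanServeFromBuffer(const ChunkBuffer& cb, int chunkNum,
                        off_t chunkOffset, size_t numBytes)
{
    return cb.buf != 0 &&
           cb.chunkno == chunkNum &&                                       // same chunk is cached
           chunkOffset >= cb.start &&                                      // starts inside the buffer
           chunkOffset + off_t(numBytes) <= cb.start + off_t(cb.length);   // ends inside it
}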

KFS supports read-ahead (prefetching); each pending read-ahead request is represented by a PendingChunkRead:

class PendingChunkRead
{
public:
    enum { kMaxReadRequest = 1 << 20 };


    PendingChunkRead(KfsClientImpl& impl, size_t readAhead);
    ~PendingChunkRead();
    bool Start(int fd, size_t off);
    ssize_t Read(char *buf, size_t numBytes);
    bool IsValid() const { return (mFd >= 0); }
    void Reset() { Start(-1, 0); }
    void SetReadAhead(size_t readAhead) { mReadAhead = readAhead; }
    size_t GetReadAhead() const { return mReadAhead; }
    off_t GetChunkOffset() const { return (IsValid() ? mReadOp.offset : -1); }
private:
    ReadOp         mReadOp;
    TcpSocket*     mSocket;
    KfsClientImpl& mImpl;
    int            mFd;
    size_t         mReadAhead;
};
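Driving a read-ahead follows directly from this interface: Start() issues the ReadOp at some chunk offset and Read() later drains the prefetched data. This is internal machinery of KfsClientImpl rather than an application-facing API, so the sketch below is only illustrative; includes and namespace qualification are omitted, and the offsets and sizes are example values.

// Sketch only: exercising the read-ahead object from inside the client implementation.
void ReadAheadSketch(KfsClientImpl& impl, int fd)
{
    PendingChunkRead pending(impl, 1 << 20);         // allow up to 1 MB of read-ahead
    if (pending.Start(fd, 0)) {                      // issue the ReadOp at chunk offset 0
        char buf[4096];
        ssize_t n = pending.Read(buf, sizeof(buf));  // consume prefetched data
        (void)n;
    }
    pending.Reset();                                 // Start(-1, 0): drop anything left over
}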

The file pointer used by each read/write is represented by a FilePosition:

///
/// \brief Location of the file pointer in a file consists of two
/// parts: the offset in the file, which then translates to a chunk #
/// and an offset within the chunk.  Also, for performance, we do some
/// client-side buffering (for both reads and writes).  The buffer
/// stores data corresponding to the "current" chunk.
///
struct FilePosition {
    FilePosition() {
        fileOffset = chunkOffset = 0;
        chunkNum = 0;
        preferredServer = NULL;
        pendingChunkRead = 0;
        prefetchReq = NULL;
    }
    ~FilePosition() {
        delete pendingChunkRead;
        delete prefetchReq;
    }
    void Reset() {
        fileOffset = chunkOffset = 0;
        chunkNum = 0;
        chunkServers.clear();
        preferredServer = NULL;
        CancelPendingRead();
    }


    off_t   fileOffset; // offset within the file
    /// which chunk are we at: this is an index into fattr.chunkTable[]
    int32_t chunkNum;
    /// offset within the chunk
    off_t   chunkOffset;


    /// transaction id info for record append
    std::vector<WriteInfo> writeId;
    
    std::vector<ChunkServerConn> chunkServers;

    PendingChunkRead* pendingChunkRead;
    /// for read prefetching
    AsyncReadReq *prefetchReq;

...
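The translation mentioned in the comment above is a plain division by the fixed chunk size (64 MB in KFS). A minimal sketch, with kChunkSize standing in for the library's chunk-size constant:

// Sketch: mapping a file offset to (chunkNum, chunkOffset).
#include <sys/types.h>
#include <stdint.h>

const off_t kChunkSize = off_t(1) << 26;   // 64 MB

void ToChunkPosition(off_t fileOffset, int32_t& chunkNum, off_t& chunkOffset)
{
    chunkNum    = int32_t(fileOffset / kChunkSize);  // index into fattr.chunkTable[]
    chunkOffset = fileOffset % kChunkSize;           // offset within that chunk
}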

Each open file is represented by a FileTableEntry:

///
/// \brief A table of entries that describe each open KFS file.
///
struct FileTableEntry {
    // the fid of the parent dir in which this entry "resides"
    kfsFileId_t parentFid;
    // stores the name of the file/directory.
    std::string name;

    // store a pointer to the associated name-cache entry
    // NameToFdMapIter pathCacheIter;

    // the full pathname
    std::string pathname;

    // one of O_RDONLY, O_WRONLY, O_RDWR; when it is 0 for a file,
    // this entry is used for attribute caching
    int openMode;
    FileAttr fattr;
    std::map <int, ChunkAttr> cattr;
    // the position in the file at which the next read/write will occur
    FilePosition currPos;
    /// the user has set a marker beyond which reads should return EOF
    off_t eofMark;
    /// For the current chunk, do some amount of buffering on the
    /// client. This helps absorb network latencies for small
    /// reads/writes.
    ChunkBuffer buffer;
    // for LRU reclamation of file table entries, track when this
    // entry was last accessed
    time_t lastAccessTime;
    // directory entries are cached; ala NFS, keep the entries cached
    // for a max of 30 secs; after that revalidate
    time_t validatedTime;

    bool skipHoles;
    unsigned int instance;
    int appendPending;
    bool didAppend;

    FileTableEntry(kfsFileId_t p, const char *n, unsigned int instance):
        parentFid(p), name(n), eofMark(-1),
        lastAccessTime(0), validatedTime(0),
        skipHoles(false), instance(instance), appendPending(0),
        didAppend(false) { }
};
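The validatedTime field implements the NFS-style 30-second attribute cache mentioned in the comments. A minimal sketch of the staleness test; the constant and helper name are illustrative.

// Sketch: does this cached entry need to be revalidated against the metaserver?
#include <time.h>

const time_t kAttrCacheValiditySecs = 30;   // the "ala NFS" lifetime from the comment

bool NeedsRevalidation(const FileTableEntry& entry, time_t now)
{
    return now - entry.validatedTime > kAttrCacheValiditySecs;
}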

All of the file APIs are implemented by KfsClientImpl:

///
/// The implementation object.
///
class KfsClientImpl {
private:
    /// Maximum # of files a client can have open.
    static const int MAX_FILES = 512000;

    /// Primitive support for concurrent access in the KFS client: at
    /// each entry point from the public interfaces, grab the mutex
    /// before doing any work. This ensures that all requests to the
    /// meta/chunk servers are serialized.
    pthread_mutex_t mMutex;

    /// Seed to the random number generator
    unsigned mRandSeed;
    bool mIsInitialized;
    /// where is the meta server located
    ServerLocation mMetaServerLoc;

    LeaseClerk mLeaseClerk;

    /// a tcp socket that holds the connection with the server
    TcpSocket mMetaServerSock;
    /// seq # that we send in each command
    kfsSeq_t mCmdSeqNum;

    /// The current working directory in KFS
    std::string mCwd;

    std::string mHostname;

    /// keep a table of open files/directory handles.
    std::vector <FileTableEntry *> mFileTable;
    NameToFdMap mPathCache;

    TelemetryClient mTelemetryReporter;
    /// set of slow nodes as flagged by the telemetry service
    std::vector<struct in_addr> mSlowNodes;
    size_t mDefaultIoBufferSize;
    size_t mDefaultReadAheadSize;
    KfsPendingOp mPendingOp;

    Asyncer mAsyncer;
    std::vector<AsyncWriteReq *> mAsyncWrites;
    unsigned int mFileInstance;
    KfsProtocolWorker* mProtocolWorker;
    int mMaxNumRetriesPerOp;
    ...
};

KfsClientImpl supports both synchronous and asynchronous reads and writes. Synchronous reads/writes are implemented by KfsPendingOp, which runs them on a separate thread, but the calling thread keeps polling the operation's status until it completes. Asynchronous reads/writes are implemented by Asyncer. Asyncer starts a new thread, implements the ITimeout interface, and drives the I/O with NetManager's MainLoop.

In its Timeout function it creates an AsyncWriteWorker for each AsyncWriteReq in its write request queue mWriteRequest and an AsyncReadWorker for each AsyncReadReq in its read request queue mReadRequest. Each AsyncWriteWorker and AsyncReadWorker opens its own network connection and adds it to the main loop.
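That per-request fan-out can be sketched as follows. Only the idea is taken from the text above (one worker per queued request, each owning a connection registered with the event loop); every type below is a placeholder rather than KFS code.

// Self-contained sketch of the fan-out done in the timeout handler.
#include <list>

struct Request { int fd; };        // stands in for AsyncReadReq / AsyncWriteReq

struct Worker {                    // stands in for AsyncReadWorker / AsyncWriteWorker
    explicit Worker(const Request& r) : req(r) { /* would open its own connection */ }
    Request req;
};

struct EventLoop {                 // stands in for NetManager's main loop
    std::list<Worker*> workers;
    void Add(Worker* w) { workers.push_back(w); }
};

// The analogue of Timeout(): drain the pending queue, one worker per request.
void OnTimeout(std::list<Request>& pending, EventLoop& loop)
{
    while (! pending.empty()) {
        loop.Add(new Worker(pending.front()));  // the loop now owns the worker
        pending.pop_front();
    }
}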


The append operation is implemented through KfsProtocolWorker. Like Asyncer, this object runs its own thread and also uses NetManager's MainLoop to perform the network I/O.

