分块方式

来源:互联网 发布:linux安装ssh服务 编辑:程序博客网 时间:2024/05/16 10:44

不同的分块方式处理

基于内容的分块

 CDC(content-defined chunking)算法是一种变长分块算法,它应用数据指纹(如Rabin指纹)将文件分割成长度大小不等的数据块。与定长分块算法不同,它是基于文件内容进行数据块切分的,因此数据块大小是可变化的。算法执行过程中,CDC使用一个固定大小(如48字节)的滑动窗口对文件数据计算数据指纹。如果指纹满足某个条件,如当它的值模特定的整数等于预先设定的数时,则把窗口位置作为块的边界。CDC算法可能会出现病态现象,即指纹条件不能满足,块边界不能确定,导致数据块过大。实现中可以对数据块的大小进行限定,设定上下限,解决这种问题。CDC算法对文件内容变化不敏感,插入或删除数据只会影响到极少的数据块,其余数据块不受影响。CDC算法也是有缺陷的,数据块大小的确定比较困难,粒度太细则开销太大,粒度过粗则dedup效果不佳。如何在两者之间权衡折衷,这是一个难点。Deduputil中CDC分块算法代码如下。
 

参考http://blog.csdn.net/liuaigui/article/details/5829083

/*
 * Emit one finished CDC block.
 *
 * If the global g_bz flag is set, the block is compressed with
 * zlib_compress_block() and the compressed bytes are copied back over
 * block_buf. The MD5 of the (possibly compressed) block is then computed and
 * the block is handed to dedup_regfile_block_process().
 *
 * Returns 0 on success, -1 if compression fails, otherwise the non-zero
 * value returned by dedup_regfile_block_process().
 */
static int cdc_emit_block(char *block_buf, unsigned int block_sz, int fd_ldata, int fd_bdata,
    unsigned int *pos, unsigned int *block_num, block_id_t **metadata, hashtable *htable)
{
    char buf_bz[BUF_MAX_SIZE];
    unsigned char md5_checksum[32 + 1] = {0};
    unsigned int bzsize;
    int ret;

    /* compress block if -z flag is given */
    if (g_bz)
    {
        bzsize = BUF_MAX_SIZE;
        if (Z_OK != zlib_compress_block(block_buf, block_sz, buf_bz, &bzsize))
        {
            return -1;
        }
        memcpy(block_buf, buf_bz, bzsize);
        block_sz = bzsize;
    }

    md5(block_buf, block_sz, md5_checksum);
    md5_2_str(md5_checksum);
    ret = dedup_regfile_block_process(block_buf, block_sz, md5_checksum,
        fd_ldata, fd_bdata, pos, block_num, metadata, htable);
    if (ret != 0)
    {
        perror("dedup_regfile_block_process in file_chunk_cdc");
    }

    return ret;
}

/*
 * content-defined chunking
 *
 * A block boundary is declared where both conditions hold:
 *   1. BLOCK_MIN_SIZE <= block_size <= BLOCK_MAX_SIZE
 *   2. hash(window) % g_block_size == CHUNK_CDC_R
 * A block that reaches BLOCK_MAX_SIZE without a matching hash is cut anyway.
 *
 * fd is the input file; fd_ldata, fd_bdata, pos, block_num, metadata and
 * htable are passed through unchanged to dedup_regfile_block_process().
 *
 * On success the trailing bytes that were not emitted as a block are copied
 * to last_block_buf and their count is stored in *last_block_len (0 if none).
 * The caller must supply a last_block_buf large enough for that tail.
 * On failure *last_block_len is not written.
 *
 * Returns 0 on success, -1 on read or compression failure, otherwise the
 * non-zero value returned by dedup_regfile_block_process().
 */
static int file_chunk_cdc(int fd, int fd_ldata, int fd_bdata, unsigned int *pos, unsigned int *block_num,
    block_id_t **metadata, hashtable *htable, char *last_block_buf, unsigned int *last_block_len)
{
    char buf[BUF_MAX_SIZE] = {0};
    char block_buf[BLOCK_MAX_SIZE * 2] = {0};
    char win_buf[BLOCK_WIN_SIZE + 1] = {0};
    char adler_pre_char = 0;
    unsigned int bpos = 0;
    unsigned int rwsize = 0;
    unsigned int exp_rwsize = BUF_MAX_SIZE;
    unsigned int head, tail;
    unsigned int block_sz = 0, old_block_sz = 0;
    unsigned int hkey = 0;
    ssize_t nread;
    int ret = 0;

    for (;;)
    {
        nread = read(fd, buf + bpos, exp_rwsize);
        if (nread < 0)
        {
            /* read() failed; do not treat -1 as a byte count */
            perror("read in file_chunk_cdc");
            ret = -1;
            goto out;
        }
        rwsize = (unsigned int)nread;
        if (rwsize == 0)
        {
            break;
        }

        /* last chunk: not enough data left to form a minimum-size block */
        if ((rwsize + bpos + block_sz) < BLOCK_MIN_SIZE)
        {
            break;
        }

        head = 0;
        tail = bpos + rwsize;

        /*
         * avoid unnecessary computation and comparison: no boundary can be
         * accepted before the block holds BLOCK_MIN_SIZE bytes, so the first
         * BLOCK_MIN_SIZE - BLOCK_WIN_SIZE bytes are copied without hashing
         */
        if (block_sz < (BLOCK_MIN_SIZE - BLOCK_WIN_SIZE))
        {
            old_block_sz = block_sz;
            block_sz = ((block_sz + tail - head) > (BLOCK_MIN_SIZE - BLOCK_WIN_SIZE)) ?
                    BLOCK_MIN_SIZE - BLOCK_WIN_SIZE : block_sz + tail - head;
            memcpy(block_buf + old_block_sz, buf + head, block_sz - old_block_sz);
            head += (block_sz - old_block_sz);
        }

        while ((head + BLOCK_WIN_SIZE) <= tail)
        {
            memcpy(win_buf, buf + head, BLOCK_WIN_SIZE);

            /*
             * Original author's note: rabinhash looked best at first but
             * performed badly; ELF_hash was better on both performance and
             * dedup rate and became the default; adler_hash is the default now.
             */
            if (g_rolling_hash)
            {
                /* full checksum for the first window of a block, rolling update afterwards */
                hkey = (block_sz == (BLOCK_MIN_SIZE - BLOCK_WIN_SIZE)) ?
                    adler32_checksum(win_buf, BLOCK_WIN_SIZE) :
                    adler32_rolling_checksum(hkey, BLOCK_WIN_SIZE, adler_pre_char, buf[head + BLOCK_WIN_SIZE - 1]);
            }
            else
            {
                hkey = g_cdc_chunk_hashfunc(win_buf);
            }

            if ((hkey % g_block_size) == CHUNK_CDC_R)
            {
                /* get a normal chunk: the window ends the block */
                memcpy(block_buf + block_sz, buf + head, BLOCK_WIN_SIZE);
                head += BLOCK_WIN_SIZE;
                block_sz += BLOCK_WIN_SIZE;
                if (block_sz >= BLOCK_MIN_SIZE)
                {
                    ret = cdc_emit_block(block_buf, block_sz, fd_ldata, fd_bdata,
                        pos, block_num, metadata, htable);
                    if (ret != 0)
                    {
                        goto out;
                    }
                    block_sz = 0;
                }
            }
            else
            {
                /* no boundary here: slide the window forward by one byte */
                block_buf[block_sz++] = buf[head++];

                /* get an abnormal chunk: force a cut at the maximum size */
                if (block_sz >= BLOCK_MAX_SIZE)
                {
                    ret = cdc_emit_block(block_buf, block_sz, fd_ldata, fd_bdata,
                        pos, block_num, metadata, htable);
                    if (ret != 0)
                    {
                        goto out;
                    }
                    block_sz = 0;
                }
            }

            /* a block was just emitted: pre-fill the next one without hashing */
            if (block_sz == 0)
            {
                block_sz = ((tail - head) > (BLOCK_MIN_SIZE - BLOCK_WIN_SIZE)) ?
                    BLOCK_MIN_SIZE - BLOCK_WIN_SIZE : tail - head;
                memcpy(block_buf, buf + head, block_sz);
                head = ((tail - head) > (BLOCK_MIN_SIZE - BLOCK_WIN_SIZE)) ?
                    head + (BLOCK_MIN_SIZE - BLOCK_WIN_SIZE) : tail;
            }

            /* head was advanced by at least one byte above, so head - 1 is valid */
            adler_pre_char = buf[head - 1];
        }

        /* read expected data from file to fill up buf */
        bpos = tail - head;
        exp_rwsize = BUF_MAX_SIZE - bpos;
        if (head > 0)
        {
            /* nothing consumed means the previous byte is unchanged; avoid buf[-1] */
            adler_pre_char = buf[head - 1];
        }
        memmove(buf, buf + head, bpos);
    }

    /* last chunk: bytes already in block_buf plus whatever is left in buf */
    *last_block_len = rwsize + bpos + block_sz;
    if (*last_block_len > 0)
    {
        memcpy(last_block_buf, block_buf, block_sz);
        memcpy(last_block_buf + block_sz, buf, rwsize + bpos);
    }

out:
    return ret;
}

滑动分块

/*
 * sliding block chunking; performance is a big issue due to too many hash
 * lookups.
 *
 * A window of g_block_size bytes is slid over the input one byte at a time.
 * The window's adler32 checksum is looked up in g_sb_htable_crc; on a hit the
 * MD5 of the window is looked up in htable. If both match, any bytes skipped
 * so far (the "fragment" in block_buf) are emitted as a block, followed by
 * the window itself. Otherwise one byte is moved into block_buf, and when
 * block_buf reaches g_block_size bytes it is emitted and its checksum is
 * recorded in g_sb_htable_crc.
 *
 * fd is the input file; fd_ldata, fd_bdata, pos, block_num, metadata and
 * htable are passed through to dedup_regfile_block_process().
 *
 * On success the trailing bytes that were not emitted are copied to
 * last_block_buf and their count is stored in *last_block_len (0 if none).
 * The caller must supply a last_block_buf large enough for that tail.
 * On failure *last_block_len is not written.
 *
 * fd is always rewound to offset 0 before returning.
 *
 * Returns 0 on success, -1 on read or compression failure, otherwise the
 * non-zero value returned by dedup_regfile_block_process().
 */
static int file_chunk_sb(int fd, int fd_ldata, int fd_bdata, unsigned int *pos, unsigned int *block_num,
    block_id_t **metadata, hashtable *htable, char *last_block_buf, unsigned int *last_block_len)
{
    char buf[BUF_MAX_SIZE] = {0};
    char buf_bz[BUF_MAX_SIZE] = {0};
    char win_buf[BLOCK_MAX_SIZE * 2] = {0};
    char block_buf[BLOCK_MAX_SIZE * 2] = {0};
    char adler_pre_char = 0;
    unsigned char md5_checksum[32 + 1] = {0};
    unsigned char md5_checksum1[32 + 1] = {0};
    unsigned char crc_checksum[16] = {0};
    unsigned int bpos = 0;
    unsigned int slide_sz = 0;
    unsigned int rwsize = 0, bzsize = 0, bzsize_f = 0;
    unsigned int exp_rwsize = BUF_MAX_SIZE;
    unsigned int head, tail;
    unsigned int hkey = 0;
    unsigned int bflag = 0;
    ssize_t nread;
    int ret = 0;

    for (;;)
    {
        nread = read(fd, buf + bpos, exp_rwsize);
        if (nread < 0)
        {
            /* read() failed; do not treat -1 as a byte count */
            perror("read in file_chunk_sb");
            ret = -1;
            goto out;
        }
        rwsize = (unsigned int)nread;
        if (rwsize == 0)
        {
            break;
        }

        /* last chunk: less than one full block of data remains */
        if ((rwsize + bpos + slide_sz) < g_block_size)
        {
            break;
        }

        head = 0;
        tail = bpos + rwsize;
        while ((head + g_block_size) <= tail)
        {
            memcpy(win_buf, buf + head, g_block_size);

            /* full checksum when starting a fresh window, rolling update otherwise */
            hkey = (slide_sz == 0) ?
                adler32_checksum(win_buf, g_block_size) :
                adler32_rolling_checksum(hkey, g_block_size, adler_pre_char, buf[head + g_block_size - 1]);
            uint_2_str(hkey, crc_checksum);

            /*
             * bflag: 0, both CRC and MD5 are not identical
             *        1, both CRC and MD5 are identical
             *        2, CRC is identical and MD5 is not
             */
            bflag = 0;

            /* this block may be a duplicate */
            bzsize = g_block_size;
            if (hash_exist(g_sb_htable_crc, crc_checksum))
            {
                bflag = 2;

                /* compress block if -z flag is given */
                if (g_bz)
                {
                    bzsize = BUF_MAX_SIZE;
                    if (Z_OK != zlib_compress_block(win_buf, g_block_size, buf_bz, &bzsize))
                    {
                        ret = -1;
                        goto out;
                    }
                    memcpy(win_buf, buf_bz, bzsize);
                }

                md5(win_buf, bzsize, md5_checksum);
                md5_2_str(md5_checksum);
                if (hash_exist(htable, md5_checksum))
                {
                    /* insert fragment: the bytes skipped before this duplicate window */
                    if (slide_sz != 0)
                    {
                        /* compress block if -z flag is given */
                        if (g_bz)
                        {
                            bzsize_f = BUF_MAX_SIZE;
                            if (Z_OK != zlib_compress_block(block_buf, slide_sz, buf_bz, &bzsize_f))
                            {
                                ret = -1;
                                goto out;
                            }
                            memcpy(block_buf, buf_bz, bzsize_f);
                            slide_sz = bzsize_f;
                        }

                        md5(block_buf, slide_sz, md5_checksum1);
                        md5_2_str(md5_checksum1);
                        ret = dedup_regfile_block_process(block_buf, slide_sz, md5_checksum1,
                            fd_ldata, fd_bdata, pos, block_num, metadata, htable);
                        if (ret != 0)
                        {
                            perror("dedup_regfile_block_process in file_chunk_sb");
                            goto out;
                        }
                    }

                    /* insert fixed-size block */
                    ret = dedup_regfile_block_process(win_buf, bzsize, md5_checksum,
                        fd_ldata, fd_bdata, pos, block_num, metadata, htable);
                    if (ret != 0)
                    {
                        perror("dedup_regfile_block_process in file_chunk_sb");
                        goto out;
                    }

                    head += g_block_size;
                    slide_sz = 0;
                    bflag = 1;
                }
            }

            /* this block is not a duplicate: slide forward by one byte */
            if (bflag != 1)
            {
                block_buf[slide_sz++] = buf[head++];
                if (slide_sz == g_block_size)
                {
                    bzsize = g_block_size;

                    /* calculate checksum and check in */
                    hkey = adler32_checksum(block_buf, bzsize);
                    uint_2_str(hkey, crc_checksum);
                    hash_checkin(g_sb_htable_crc, crc_checksum);

                    /* compress block if -z flag is given */
                    if (g_bz)
                    {
                        bzsize = BUF_MAX_SIZE;
                        if (Z_OK != zlib_compress_block(block_buf, g_block_size, buf_bz, &bzsize))
                        {
                            ret = -1;
                            goto out;
                        }
                        memcpy(block_buf, buf_bz, bzsize);
                    }

                    md5(block_buf, bzsize, md5_checksum);
                    md5_2_str(md5_checksum);
                    ret = dedup_regfile_block_process(block_buf, bzsize, md5_checksum,
                        fd_ldata, fd_bdata, pos, block_num, metadata, htable);
                    if (ret != 0)
                    {
                        perror("dedup_regfile_block_process in file_chunk_sb");
                        goto out;
                    }
                    slide_sz = 0;
                }
            }

            /* head was advanced by at least one byte above, so head - 1 is valid */
            adler_pre_char = buf[head - 1];
        }

        /* read expected data from file to fill up buf */
        bpos = tail - head;
        exp_rwsize = BUF_MAX_SIZE - bpos;
        if (head > 0)
        {
            /* nothing consumed means the previous byte is unchanged; avoid buf[-1] */
            adler_pre_char = buf[head - 1];
        }
        memmove(buf, buf + head, bpos);
    }

    /* last chunk: pending fragment plus whatever is left in buf */
    *last_block_len = rwsize + bpos + slide_sz;
    if (*last_block_len > 0)
    {
        memcpy(last_block_buf, block_buf, slide_sz);
        memcpy(last_block_buf + slide_sz, buf, rwsize + bpos);
    }

out:
    /* NOTE(review): rewinds the input on every exit path; presumably the caller re-reads fd - confirm */
    lseek(fd, 0, SEEK_SET);
    return ret;
}

固定分块

/*
 * fixed-sized file chunking
 *
 * Reads fd in blocks of g_block_size bytes. Each full block is hashed with
 * MD5 and handed to dedup_regfile_block_process() together with fd_ldata,
 * fd_bdata, pos, block_num, metadata and htable.
 *
 * A final read shorter than g_block_size is not processed here: its bytes
 * are copied to last_block_buf and its length is stored in *last_block_len
 * (0 when the file ends exactly on a block boundary). The caller must supply
 * a last_block_buf of at least g_block_size bytes. On failure
 * *last_block_len is not written.
 *
 * NOTE(review): unlike the other chunkers, no zlib compression is applied
 * here; the compression step was commented out in the original code.
 *
 * Returns 0 on success, an errno value if the buffer cannot be allocated,
 * -1 on read failure, otherwise the non-zero value returned by
 * dedup_regfile_block_process().
 */
static int file_chunk_fsp(int fd, int fd_ldata, int fd_bdata, unsigned int *pos, unsigned int *block_num,
    block_id_t **metadata, hashtable *htable, char *last_block_buf, unsigned int *last_block_len)
{
    int ret = 0;
    ssize_t nread;
    unsigned int rwsize = 0;
    unsigned char md5_checksum[32 + 1] = {0};
    char *buf = NULL;

    buf = malloc(g_block_size * 2);
    if (buf == NULL)
    {
        /* save errno first: perror() is allowed to modify it */
        int saved_errno = errno;

        perror("malloc in file_chunk_fsp");
        return (saved_errno != 0) ? saved_errno : ENOMEM;
    }

    for (;;)
    {
        nread = read(fd, buf, g_block_size);
        if (nread < 0)
        {
            /* read() failed; never use -1 as the last-block length */
            perror("read in file_chunk_fsp");
            ret = -1;
            goto out;
        }
        rwsize = (unsigned int)nread;

        /* end of file, or the last block is shorter than a full block */
        if (rwsize == 0 || rwsize != g_block_size)
        {
            break;
        }

        /* calculate md5 */
        md5(buf, rwsize, md5_checksum);
        md5_2_str(md5_checksum);
        ret = dedup_regfile_block_process(buf, rwsize, md5_checksum, fd_ldata,
            fd_bdata, pos, block_num, metadata, htable);
        if (ret != 0)
        {
            perror("dedup_regfile_block_process in file_chunk_fsp");
            goto out;
        }
    }

    *last_block_len = rwsize;
    if (rwsize > 0)
    {
        memcpy(last_block_buf, buf, rwsize);
    }

out:
    free(buf);
    return ret;
}
0 0
原创粉丝点击