csapp-lab4 cachelab

来源：互联网发布：淘宝店铺修改退货地址编辑：程序博客网时间：2024/06/08 17:22

Cache-Lab: Understanding Cache Memories

该lab是为了帮助理解缓存对于c程序的性能影响，包括两部分：第一部分是写一个模拟缓存的c程序，第二部分是对于一个转置矩阵的函数进行优化来达到尽可能小的miss数量。

官网下载tar文件并解压tar xvf cachelab-handout.tar得到需要编辑的两个文件csim.c和trans.c以及其他文件。

Part A

首先需要定义缓存的数据结构，这里是组相联缓存，所以定义缓存行Cache_line，组成缓存组Cache_set，再组成缓存Cache，具体定义见下面的代码。

    /* NOTE(review): presumably the address width in bits; not referenced by the code shown here. */
    #define ADDRESS_SIZE 64

    /* A single cache line. No data payload is stored; only bookkeeping. */
    typedef struct {
        char valid_bit;         /* 0 when the line is empty (set to 0 by init_cache) */
        unsigned long tag;      /* tag of the block held by this line */
        int LRU_count;          /* replacement counter (set to 0 by init_cache) */
    } Cache_line;

    /* A cache set: an array of lines (E entries, allocated by init_cache). */
    typedef struct {
        Cache_line *lines;
    } Cache_set;

    /* The whole cache: S sets, each holding E lines. */
    typedef struct {
        int S;                  /* number of sets */
        int E;                  /* number of lines per set */
        Cache_set *sets;        /* array of S sets */
    } Cache;

再考虑解析命令行选项，使用getopt库函数，具体用法可以参考这里。下面的主函数见文知意，后面的init_cache(s, E, b, &cache)和cacheSimulator(s, E, b, file, isVerbose, &cache)分别代表初始化缓存数据结构（使用malloc分配内存）和缓存模拟。

/*
 * Entry point of the cache simulator.
 *
 * Options:
 *   -v         verbose: echo every traced access
 *   -s <num>   number of set-index bits
 *   -E <num>   number of lines per set
 *   -b <num>   number of block-offset bits
 *   -t <file>  trace file to replay
 *
 * Returns 0 on success, 1 when the arguments are missing or out of range.
 */
int main(int argc, char *argv[]) {
    /* Zero-initialised so that a missing option is detected instead of read as garbage. */
    int s = 0, E = 0, b = 0;
    char file[100] = "";                /* name of the trace file */
    int isVerbose = 0;
    Cache cache;
    const int address_bits = (int)(8 * sizeof(unsigned long));

    hit_count = miss_count = eviction_count = 0;

    int ch;
    while ((ch = getopt(argc, argv, "vs:E:b:t:")) != -1) {
        switch (ch) {
            case 'v':
                isVerbose = 1;
                break;
            case 's':
                s = atoi(optarg);
                break;
            case 'E':
                E = atoi(optarg);
                break;
            case 'b':
                b = atoi(optarg);
                break;
            case 't': {
                /* Bounded copy: reject names that do not fit instead of overflowing. */
                int n = snprintf(file, sizeof file, "%s", optarg);
                if (n < 0 || (size_t)n >= sizeof file) {
                    fprintf(stderr, "trace file name too long\n");
                    return 1;
                }
                break;
            }
            default:
                break;
        }
    }

    /*
     * s <= 30 keeps `1 << s` representable in an int; s + b must stay below
     * the address width so that the tag shift is well defined.
     */
    if (s < 0 || s > 30 || E <= 0 || b < 0 || s + b >= address_bits || file[0] == '\0') {
        fprintf(stderr, "usage: %s [-v] -s <s> -E <E> -b <b> -t <tracefile>\n", argv[0]);
        return 1;
    }

    init_cache(s, E, b, &cache);
    cacheSimulator(s, E, b, file, isVerbose, &cache);
    printSummary(hit_count, miss_count, eviction_count);
    return 0;
}

/*
 * Allocate and clear the cache.
 *
 *   s     -- number of set-index bits (the cache gets 2^s sets)
 *   E     -- number of lines per set
 *   b     -- number of block-offset bits (unused here)
 *   cache -- cache to initialise; its sets and lines are heap-allocated
 *
 * Terminates the program if memory cannot be allocated.
 */
void init_cache(int s, int E, int b, Cache* cache) {
    int i, j;

    (void)b;
    cache->S = 1 << s;                  /* 2^s sets (was `2 << s`, i.e. twice too many) */
    cache->E = E;
    cache->sets = malloc((size_t)cache->S * sizeof *cache->sets);
    if (cache->sets == NULL) {
        fprintf(stderr, "out of memory\n");
        exit(EXIT_FAILURE);
    }
    for (i = 0; i < cache->S; i++) {
        cache->sets[i].lines = malloc((size_t)E * sizeof *cache->sets[i].lines);
        if (cache->sets[i].lines == NULL) {
            fprintf(stderr, "out of memory\n");
            exit(EXIT_FAILURE);
        }
        for (j = 0; j < cache->E; j++) {
            cache->sets[i].lines[j].valid_bit = 0;
            cache->sets[i].lines[j].tag = 0;
            cache->sets[i].lines[j].LRU_count = 0;
        }
    }
}

/*
 * Replay a trace file against the cache.
 *
 *   s, E, b   -- cache geometry (set bits, lines per set, block bits)
 *   file      -- name of the trace file
 *   isVerbose -- when 1, echo each access (the handlers receive the flag too)
 *   pCache    -- cache to operate on
 *
 * Instruction fetches ('I') are skipped; 'S', 'M' and 'L' records are
 * dispatched to store(), modify() and load(). Terminates the program if the
 * trace file cannot be opened.
 */
void cacheSimulator(int s, int E, int b, char* file, int isVerbose, Cache* pCache) {
    FILE *pFile;                        /* trace file stream */
    char access_type;                   /* L-load S-store M-modify I-instruction */
    unsigned long address;              /* hexadecimal memory address */
    int size;                           /* number of bytes accessed by the operation */
    int tag_move_bits = b + s;

    pFile = fopen(file, "r");
    if (pFile == NULL) {
        perror(file);
        exit(EXIT_FAILURE);
    }

    /* Require all three fields; a partial match would reuse stale values. */
    while (fscanf(pFile, " %c %lx,%d", &access_type, &address, &size) == 3) {
        if (access_type == 'I') {
            continue;
        }

        /*
         * Split the address into tag and set index.
         * NOTE(review): the handlers take the tag as int, so the upper bits of
         * a wide tag are dropped here; widening needs their signatures changed.
         */
        int tag = (int)(address >> tag_move_bits);
        int set_index = (int)((address >> b) & ((1UL << s) - 1));

        if (isVerbose == 1) {
            printf("%c %lx,%d ", access_type, address, size);
        }
        if (access_type == 'S') {
            store(pCache, set_index, tag, isVerbose);
        }
        if (access_type == 'M') {
            modify(pCache, set_index, tag, isVerbose);
        }
        if (access_type == 'L') {
            load(pCache, set_index, tag, isVerbose);
        }
        if (isVerbose == 1) {
            printf("\n");
        }
    }
    fclose(pFile);
}

在函数cacheSimulator中主要是根据不同的操作模式来调用不同的函数，核心函数是load(pCache, set_index, tag, isVerbose)，load函数先调用get_hitIndex函数来获取内存地址是否在缓存命中，根据命中还是miss(分为两种，cold miss和eviction)采取不同的处理方式。省略的代码如下，详细代码见这里。

    /*
     * Outline of the load handler (bodies elided in the original post).
     *
     * Looks the tag up in the given set and updates the global hit, miss and
     * eviction counters according to the outcome.
     */
    void load(Cache *cache, int set_index, int tag, int isVerbose) {
        /* Is the block already cached? -1 means no. */
        int hitIndex = get_hitIndex(cache, set_index, tag);
        if (hitIndex == -1) {               /* miss */
            miss_count++;
            /* ... elided ... */
            int emptyIndex = get_emptyIndex(cache, set_index, tag);
            if (emptyIndex == -1) {         /* set is full: evict a line */
                eviction_count++;
                /* ... elided ... */
            } else {                        /* set not full: use the empty line found by a sequential scan */
                /* ... elided ... */
            }                               /* this closing brace was missing in the original sketch */
        } else {                            /* hit */
            hit_count++;
            /* ... elided ... */
        }
    }

所以接下来最重要的就是怎样根据LRU策略来对不同的命中或miss情况进行处理，具体的算法如下：

命中时，被访问行的计数器LRU_count置0，比其低的计数器加1，其余不变
未命中且该组未满时，新行计数器置为0，其余全加1
未命中且该组已满时，计数器为（cache->E - 1）的那行主存块被淘汰，新行计数器置0，其余加1

Part B

PartB部分是针对给定的缓存对不同维度的矩阵进行转置，目标是减小转置函数的miss数目。该题有三个维度：

32 * 32
64 * 64
67 * 61

具体代码放在下面，1和3比较简单，64 * 64矩阵由于一行的大小超过了缓存的大小，所以需要再一次进行分块，详细代码和注释如下，不过这里我也没能够拿到满分，以后有机会再更新这个吧。

    /*
     * Transpose the N x M matrix A into the M x N matrix B using blocking to
     * reduce cache misses.
     *
     *   M -- number of columns of A (rows of B)
     *   N -- number of rows of A (columns of B)
     *
     * Three strategies are selected by M:
     *   M == 32 : 8 x 8 blocks, diagonal element written last
     *   M == 64 : 8 x 8 blocks processed as four 4 x 4 quadrants
     *   other   : 16 x 16 blocks with bounds checks for ragged edges
     * The first two branches assume N is a multiple of 8.
     */
    void transpose_submit(int M, int N, int A[N][M], int B[M][N]) {
        int i, j, ii, jj, tmp, tmp1, tmp2, tmp3;
        int bsize = 8;

        tmp = 0;
        /*
         * s=5 -> S=32 sets, b=5 -> B=32 bytes = 8 ints.
         * A 32 x 32 matrix is split into 8 x 8 blocks, 4 blocks per row, which
         * exactly fills the cache, so each 8-int cache block is fully used.
         */
        if (M == 32) {
            for (ii = 0; ii < N; ii += bsize) {
                for (jj = 0; jj < M; jj += bsize) {
                    for (i = ii; i < ii + bsize; i++) {
                        for (j = jj; j < jj + bsize; j++) {
                            if (i != j) {
                                B[j][i] = A[i][j];
                            } else {
                                tmp = A[i][j];      /* hold the diagonal element back */
                            }
                        }
                        if (ii == jj) {             /* fewer evictions along the diagonal */
                            B[i][i] = tmp;
                        }
                    }
                }
            }
        } else if (M == 64) {
            for (ii = 0; ii < N; ii += bsize) {
                for (jj = 0; jj < M; jj += bsize) {
                    /* Upper-left quadrant; four locals buffer one row of the 4 x 4 tile. */
                    j = jj;
                    for (i = ii; i < ii + 4; i++) {
                        tmp = A[i][j];
                        tmp1 = A[i][j + 1];
                        tmp2 = A[i][j + 2];
                        tmp3 = A[i][j + 3];
                        B[j][i] = tmp;
                        B[j + 1][i] = tmp1;
                        B[j + 2][i] = tmp2;
                        B[j + 3][i] = tmp3;
                    }
                    /* Upper-right quadrant. */
                    j = jj + 4;
                    for (i = ii; i < ii + 4; i++) {
                        tmp = A[i][j];
                        tmp1 = A[i][j + 1];
                        tmp2 = A[i][j + 2];
                        tmp3 = A[i][j + 3];
                        B[j][i] = tmp;
                        B[j + 1][i] = tmp1;
                        B[j + 2][i] = tmp2;
                        B[j + 3][i] = tmp3;
                    }
                    /* Lower-right quadrant (j is still jj + 4). */
                    for (i = ii + 4; i < ii + 8; i++) {
                        tmp = A[i][j];
                        tmp1 = A[i][j + 1];
                        tmp2 = A[i][j + 2];
                        tmp3 = A[i][j + 3];
                        B[j][i] = tmp;
                        B[j + 1][i] = tmp1;
                        B[j + 2][i] = tmp2;
                        B[j + 3][i] = tmp3;
                    }
                    /* Lower-left quadrant goes last, also to reduce conflict misses. */
                    j = jj;
                    for (i = ii + 4; i < ii + 8; i++) {
                        tmp = A[i][j];
                        tmp1 = A[i][j + 1];
                        tmp2 = A[i][j + 2];
                        tmp3 = A[i][j + 3];
                        B[j][i] = tmp;
                        B[j + 1][i] = tmp1;
                        B[j + 2][i] = tmp2;
                        B[j + 3][i] = tmp3;
                    }
                }
            }
        } else {
            /* Pick a block size that is good enough for the miss target. */
            bsize = 16;
            for (ii = 0; ii < N; ii += bsize) {
                for (jj = 0; jj < M; jj += bsize) {
                    for (i = ii; (i < ii + bsize) && (i < N); i++) {
                        for (j = jj; (j < jj + bsize) && (j < M); j++) {
                            if (i != j) {
                                B[j][i] = A[i][j];
                            } else {
                                tmp = A[i][j];
                            }
                        }
                        /*
                         * Only rows that actually have a diagonal element
                         * (i < M) are written back; otherwise B[i][i] would be
                         * outside B and tmp would be stale.
                         */
                        if (ii == jj && i < M) {
                            B[i][i] = tmp;
                        }
                    }
                }
            }
        }
    }

阅读全文

0 0