利用高速缓存(Cache)的局部性优化矩阵乘法

合集下载

1、下载文档前请自行甄别文档内容的完整性，平台不提供额外的编辑、内容补充、找答案等附加服务。
2、"仅部分预览"的文档,不可在线预览部分如存在完整性等问题,可反馈申请退款(可完整预览的文档不适用该条件!)。
3、如文档侵犯您的权益，请联系客服反馈,我们会尽快为您处理(人工客服工作时间：9:00-18:30)。

int sj, si, sk; for ( sj = 0; sj < n; sj += blocksize )
for ( si = 0; si < n; si += blocksize ) for ( sk = 0; sk < n; sk += blocksize )
-4-
do_block(n, blocksize, si, sj, sk, A, B, C); }
*B, double *C)
{
dgemm (n, blocksize, A+si*n+sk, B+sk*n+sj, C+si*n+sj);
//printf("\n");
//printf("%d %d %d\n", si, sj, sk);
//for(int i = 0; i < n; i++)
//{
}
-2-
分析：计算机在实际计算上述普通矩阵乘法时，所计算矩阵 C 的每一个数据时，都要用到矩阵 A 的某行和矩阵 B 中的某列，而矩阵 A、B 和 C 都是存储在内存中的，又由于 CPU 的速度远远大于访问内存的速度，如果是直接从内存读取和写回计算数据，那么计算效率是非常低下的，由于访问内存会导致时延，CPU 的计算资源被浪费，即计算效率低。为了提高计算速度，引入了 cache 机制，即先把存放在内存中的矩阵 A、B 的元素调入 cache，这样寄存器可以先寻访 cache，访问 cache 的速度要比访问内存的速度快，如果在 cache 中没有所需要的数据时，才需要访问内存。但是，矩阵 A、B 在实际应用中都包含大量的元素，数据量非常分庞大，也即，上述程序中 n 很大，而处理器中的 cache 往往很小，因此不能将整个矩阵全部放入 cache 中。因此需要将这些大的矩阵按照某种方法进行分块，使得分块后的小矩阵可以放入到 cache 中，但是分块又不能随意分，需要有一定的原则去分块，如果分块子矩阵太大，那么子矩阵还是不能全部放入 cache 中，如果分块子矩阵太小，那么为了计算一个大矩阵的数据，需要调入 cache 的子矩阵的次数会增加，因此需要选择合适的分块方法。 2.分块实现矩阵乘法，利用 cache 的局部性，优化程序性能： a) 安装 Linux 系统： b) 查看 Linux 系统 cache 的大小：
} （2）分析普通矩阵乘法访存时高速缓存缺失与矩阵大小 n 和 cache 容量的关系。
-1-
（3）将矩阵乘法分块实现来充分利用高速缓存的局部性，优化程序性能。 #define BLOCKSIZE 32 void do_block (int n, int si, int sj, int sk, double *A, double *B, double *C) {
二、实验环境：
硬件：桌面 PC 软件：linux （GCC 采用“inline”功能去除调用开销）
三、实验内容：
按照下面的实验步骤及说明，完成相关实验并提交实验报告：（1）如下，普通矩阵乘法一般采用 3 重循环完成。
void dgemm (int n, double* A, double* B, double* C) {
一、实验目的与要求：
实验目的： 1.增进对 cache 工作原理以及计算机存储体系的理解； 2.体验程序中访存模式变化是如何影响 cahce 效率进而影响程序性能的过程；
实验要求： 1.自学课本高速缓存章节的相关知识； 2.分析普通矩阵乘法访存时高速缓存缺失与矩阵大小 n 和 cache 容量的关系； 3.将矩阵乘法分块实现来充分利用高速缓存的局部性，优化程序性能； 4.测试分块版本矩阵计算程序在矩阵不同大小（32，64，128，160，480，960，….MAX），
// for(int j = 0; j < n; j++)
// {
//
printf("%lf ", C[si*n+sj]);
// }
// printf("\n");
//}
//printf(ห้องสมุดไป่ตู้\n");
}
inline void dgemm_block (int n, int blocksize, double* A, double* B, double* C) {
if(C==NULL || A == NULL || B == NULL) {
printf("malloc error!"); return 1; }
memset(C, 0, sizeof(double)*N*N); for (i = 0; i < N; i++) {
for (j = 0; j < N; j++) {
c) 编写普通矩阵乘法运算的代码、利用 cache 局部性分块实现矩阵相乘的代码，如下：
#include <stdio.h> #include <memory.h> #include <time.h> #include <stdlib.h> #include<sys/time.h>
-3-
inline void dgemm (int n, int blocksize, double* A, double* B, double* C) {
int i, j, k; for(i = 0; i < blocksize; i++) {
for(j = 0; j < blocksize; j++) {
double cij = C[i*n+j]; /* cij = C[i][j] */ for(k = 0; k < blocksize; k++ )
-5-
_usec); printf("mean timeuse = %f us \n",Timeuse/N/N/N); /********************计时*****************/ printf("N : %d blocksize : %d\n", N, BLOCKSIZE); printf("%d %lf %lf\n\n", N, C[0], C[N*(N-1) + N-1]);
int BLOCKSIZE = N/sizes[th]; double *A=NULL, *B=NULL, *C=NULL; A = (double*)malloc(sizeof(double)*N*N); B = (double*)malloc(sizeof(double)*N*N); C = (double*)malloc(sizeof(double)*N*N);
for (int i = 0; i < n; ++i) for (int j = 0; j < n; ++j) { double cij = C[i+j*n]; /* cij = C[i][j] */ for(int k = 0; k < n; k++ ) cij += A[i+k*n] * B[k+j*n]; /* cij += A[i][k]*B[k][j] */ C[i+j*n] = cij; /* C[i][j] = cij */ }
free(A); free(B); free(C); } printf("successful!\n"); getchar(); return 0; }
int main2() {
int N = 2048; int sizes[9] = {64, 128, 256, 480，512，960，1024，1536, 1920}; int i, j, th; for (th = 0; th < 9; th++){
//dgemm (N, N, A, B, C); dgemm_block(N, BLOCKSIZE, A, B, C); /********************计时*****************/ gettimeofday(&endTime,NULL); Timeuse = 1000000*(_sec - _sec) + (_usec -
cij += A[i*n+k] * B[k*n + j]; /* cij += A[i][k]*B[k][j] */ C[i*n+j] = cij; /* C[i][j] = cij */ } } }
inline void do_block (int n, int blocksize, int si, int sj, int sk, double *A, double
（5）调整 BLOCKSIZE 大小，分析 BLOCKSIZE 大小对程序性能的影响。
（6）制作相关数据的对比图表，分析原因并提交实验报告。
四、实验步骤与过程：(给出程序分析和算法描述（流程图或文字）、程序核心代码。)
实验步骤： 1.分析普通矩阵乘法访存时高速缓存缺失与矩阵大小 n 和 cache 容量的关系：普通矩阵乘法： void dgemm (int n, double* A, double* B, double* C) {
}
void dgemm_block (int n, double* A, double* B, double* C) {
for ( int sj = 0; sj < n; sj += BLOCKSIZE ) for ( int si = 0; si < n; si += BLOCKSIZE ) for ( int sk = 0; sk < n; sk += BLOCKSIZE ) do_block(n, si, sj, sk, A, B, C);
int main() {
int sizes[9] = {64, 128, 256, 480，512，960，1024，1536, 1920}; int i,j,th; for (th = 0; th < 9; th++){
int BLOCKSIZE = 16; int N = sizes[th];
if(N%BLOCKSIZE!=0) continue; double *A=NULL, *B=NULL, *C=NULL; A = (double*)malloc(sizeof(double)*N*N); B = (double*)malloc(sizeof(double)*N*N); C = (double*)malloc(sizeof(double)*N*N);
} 完成 do_block 子程序的设计和相关性能测试程序的代码。
（ 4 ）测试分块版本矩阵计算程序在矩阵不同大小（ 32 ， 64 ， 128 ， 160 ， 480 ， 960，….MAX），时与未分块版本的性能对比。注：最大值 MAX 使得矩阵尺寸增大到不能在 cache 中完全容纳这三个矩阵时值（计算机 cache 大小可自行查找相关处理器的资料或用相关测试软件）。
for (int i = 0; i < n; ++i) for (int j = 0; j < n; ++j) { double cij = C[i+j*n]; /* cij = C[i][j] */ for(int k = 0; k < n; k++ ) cij += A[i+k*n] * B[k+j*n]; /* cij += A[i][k]*B[k][j] */ C[i+j*n] = cij; /* C[i][j] = cij */ }
时与未分块版本的性能对比。注：最大值 MAX 使得矩阵尺寸增大到不能在 cache 中完全容纳这三个矩阵时值（计算机 cache 大小可自行查找相关处理器的资料或用相关测试软件）
5.调整 BLOCKSIZE 大小，分析 BLOCKSIZE 大小对程序性能的影响； 6.制作相关数据的对比图表，分析原因并在 blackboard 上提交实验报告。
A[i*N + j] = 1; B[i*N + j] = 2; } }
/********************计时*****************/ struct timeval startTime,endTime; float Timeuse; gettimeofday(&startTime,NULL); /********************计时*****************/