CUDA Lab 2：计算两个矩阵对应列的相似度

合集下载

1、下载文档前请自行甄别文档内容的完整性，平台不提供额外的编辑、内容补充、找答案等附加服务。
2、"仅部分预览"的文档,不可在线预览部分如存在完整性等问题,可反馈申请退款(可完整预览的文档不适用该条件!)。
3、如文档侵犯您的权益，请联系客服反馈,我们会尽快为您处理(人工客服工作时间：9:00-18:30)。

// Allocate the host input matrix A float *h_A = (float *)malloc(size);
// Allocate the host input matrix B float *h_B = (float *)malloc(size);
// Allocate the host output matrix C float *h_C = (float *)malloc(size);
// Initialize the host input matrixs for (int i = 0; i < numElements; ++i) {
h_A[i] = 1.1; h_B[i] = 1.2; }
// Allocate the device input matrix A float *d_A = NULL; err = cudaMalloc((void **)&d_A, size);
clock_t start,end; start=clock();
// Error code to check return values for CUDA calls cudaError_t err = cudaSuccess;
// Print the matrix length to be used, and compute its size int numElements = Width*Width; size_t size = numElements * sizeof(float); printf("[matrix multiplication of %d elements]\n", numElements);
if (err != cudaSuccess) {
fprintf(stderr, "Failed to allocate device matrix A (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE); }
// Allocate the device input matrix B float *d_B = NULL; err = cudaMalloc((void **)&d_B, size);
1. 目标
Lab 2:计算两个矩阵对应列的相似度
掌握如何运用共享内存与并行归约方法进行两个矩阵对应列的相似度计算
2. 过程
请结合 PPT 中的内容，完成三种不同情况下的两个矩阵对应列的相似度计算
相似度计算：采用 SAD（Sum of Absolute Differences，绝对差值之和），即对每一列 j 计算 $\mathrm{SAD}(j)=\sum_{k}\lvert M_{k,j}-N_{k,j}\rvert$。
方法要求：
1）只用全局内存
2）只用共享内存
3）用共享内存同时避免分支发散
实验要求：
if (err != cudaSuccess)
{ fprintf(stderr, "Failed to allocate device matrix C (error
cudaGetErrorString(err)); exit(EXIT_FAILURE);
} // printf("Copy input data from the host memory to the CUDA device\n"); err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
// Verify that allocations succeeded if (h_A == NULL || h_B == NULL || h_C == NULL) {
fprintf(stderr, "Failed to allocate host matrixs!\n"); exit(EXIT_FAILURE); }
if (err != cudaSuccess) {
fprintf(stderr, "Failed to free device matrix A (error code %s)!\n", cudaGetErrorString(err));
for (j = 0; j < Width; ++j) {
sum = 0; for (k = 0; k < Width; ++k)
{ a = M[k * Width+ j]; b = N[k * Width+ j]; sum += fabs(a - b);
} P[j] = sum; } }
/** * Host main routine */
fprintf(stderr, "Failed to launch MatrixMulKernel (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE); } // Copy the device result matrix in device memory to the host result matrix in host memory. // printf("Copy output data from the CUDA device to the host memory\n"); err = cudaMemcpy(h_C,d_C,size,cudaMemcpyDeviceToHost); if (err != cudaSuccess) {
int main(int argc,char * argv[]) {
cudaSetDevice(0);
int Width=0; if(argc==1) {
prin); exit(1); } Width=atoi(argv[1]);
int numElements,size,i,size1; float *h_A=NULL,*h_B=NULL,*h_C=NULL,time1; clock_t start,end; start=clock();
for(i=0;i<Width;i++) {
printf("%f ",h_C[i]); } printf("\n"); free(h_A); free(h_B); free(h_C);
return 0; } 全局内存实现代码
#include <stdio.h> #include <time.h> // For the CUDA runtime routines (prefixed with "cuda_") #include <cuda_runtime.h> #include <math.h>
行时间
3）用上述三种方法实现 2 个矩阵（1024*1024）的对应列的相似度计算，比较这三种方法在运行时间上的差异，同时分析结果是否与课堂上所讲的知识吻合。
注意：一定要验证你的结果是否正确。另外，如果你已经实现了 SAD，也请考虑如何用 NCC（归一化互相关方法）实现。
CPU 实现代码
/** * the code on cpu of SAD */
fprintf(stderr, "Failed to copy matrix C from device to host (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
// Free device global memory err = cudaFree(d_A);
for (i = 0; i < numElements; ++i) {
h_A[i] = 1.1;
h_B[i] = 1.2; }
MatrixSADcpu(h_A, h_B, h_C,Width);
end=clock(); time1=(float)(end-start)/CLOCKS_PER_SEC; printf("执行时间为：%f\n",time1); printf("Done\n");
__syncthreads(); }
stride *= 2)
}
/** * Host main routine */
int main(int argc,char * argv[]) {
cudaSetDevice(0);
int Width=0; if(argc==1) {
printf("Input the width\n"); exit(1); } Width=atoi(argv[1]);
code
%s)!\n",
if (err != cudaSuccess) {
fprintf(stderr, "Failed to copy matrix A from host to device (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE); }
if(t<Width*Width) Pd[t]=fabs(Md[t]-Nd[t]);
__syncthreads();
for (unsigned int stride = 1; stride <Width; {
if (t % (2*stride) == 0) Pd[t] += Pd[t+stride];
numElements= Width*Width; size = numElements * sizeof(float); size1=Width * sizeof(float);
printf("[The SAD of two %d by %d matrix]\n",Width,Width);
h_A = (float *)malloc(size); h_B = (float *)malloc(size); h_C = (float *)malloc(size1);
if (err != cudaSuccess) {
fprintf(stderr, "Failed to allocate device matrix B (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE); }
// Allocate the device output matrix C float *d_C = NULL; err = cudaMalloc((void **)&d_C, size);
/** * CUDA Kernel Device code * * */
__global__ void MatrixSADGlobalMemKernel( float *Md, float *Nd, float *Pd, int Width) {
int t=blockIdx.x*blockDim.x+threadIdx.x;
err = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
fprintf(stderr, "Failed to copy matrix B from host to device (error code %s)!\n", cudaGetErrorString(err));
1）用全局内存方法实现 2 个矩阵（32*32）的对应列的相似度计算，M 矩阵的初始值全为 1.1，N 矩阵的初始值全为 1.2。同时用 CPU 代码实现，比较两个代码的运行时间。
2）用共享内存方法实现 2 个矩阵（1024*1024）的对应列的相似度计算，M 矩阵的初
始值全为 1.1，N 矩阵的初始值全为 1.2。同时用 CPU 代码实现，比较两个代码的运
#include <stdio.h> #include <time.h> #include <malloc.h> /**
* Host code * */
void MatrixSADcpu(float *M, float *N, float *P, int Width) {
int j,k; double sum,a,b;
exit(EXIT_FAILURE); }
// Launch the matrix Add CUDA Kernel
MatrixSADGlobalMemKernel<<<Width,Width>>>(d_A, d_B, d_C,Width);
err = cudaGetLastError(); if (err != cudaSuccess) {