CUDA Lab 2:计算两个矩阵对应列的相似度

  1. 下载文档前请自行甄别文档内容的完整性,平台不提供额外的编辑、内容补充、找答案等附加服务。
  2. "仅部分预览"的文档,不可在线预览部分如存在完整性等问题,可反馈申请退款(可完整预览的文档不适用该条件!)。
  3. 如文档侵犯您的权益,请联系客服反馈,我们会尽快为您处理(人工客服工作时间:9:00-18:30)。
// Allocate the host input matrix A float *h_A = (float *)malloc(size);
// Allocate the host input matrix B float *h_B = (float *)malloc(size);
// Allocate the host output matrix C float *h_C = (float *)malloc(size);
// Initialize the host input matrixs for (int i = 0; i < numElements; ++i) {
h_A[i] = 1.1; h_B[i] = 1.2; }
// Allocate the device input matrix A float *d_A = NULL; err = cudaMalloc((void **)&d_A, size);
clock_t start,end; start=clock();
// Error code to check return values for CUDA calls cudaError_t err = cudaSuccess;
// Print the matrix length to be used, and compute its size int numElements = Width*Width; size_t size = numElements * sizeof(float); printf("[matrix multiplication of %d elements]\n", numElements);
if (err != cudaSuccess) {
fprintf(stderr, "Failed to allocate device matrix A (error code %s)!\n", cudaGetErrorString(err));
// Allocate the device input matrix B float *d_B = NULL; err = cudaMalloc((void **)&d_B, size);
1. 目标
Lab 2:计算两个矩阵对应列的相似度
2. 过程
请结合 PPT 中的内容,完成三种不同情况下的两个矩阵对应列的相似度计算
相似度计算:采用 SAD(绝对值差和的形式)
1) 只用全局内存
2) 只用共享内存
3) 用共享内存同时避免分支发散
实验要求:
if (err != cudaSuccess)
{ fprintf(stderr, "Failed to allocate device matrix C (error
cudaGetErrorString(err)); exit(EXIT_FAILURE);
} // printf("Copy input data from the host memory to the CUDA device\n"); err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
// Verify that allocations succeeded if (h_A == NULL || h_B == NULL || h_C == NULL) {
fprintf(stderr, "Failed to allocate host matrixs!\n"); exit(EXIT_FAILURE); }
if (err != cudaSuccess) {
fprintf(stderr, "Failed to free device matrix A (error code %s)!\n", cudaGetErrorString(err));
for (j = 0; j < Width; ++j) {
sum = 0; for (k = 0; k < Width; ++k)
{ a = M[k * Width+ j]; b = N[k * Width+ j]; sum += fabs(a - b);
} P[j] = sum; } }
/** * Host main routine */
fprintf(stderr, "Failed to launch MatrixMulKernel (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE); } // Copy the device result matrix in device memory to the host result matrix in host memory. // printf("Copy output data from the CUDA device to the host memory\n"); err = cudaMemcpy(h_C,d_C,size,cudaMemcpyDeviceToHost); if (err != cudaSuccess) {
int main(int argc,char * argv[]) {
int Width=0; if(argc==1) {
prin); exit(1); } Width=atoi(argv[1]);
int numElements,size,i,size1; float *h_A=NULL,*h_B=NULL,*h_C=NULL,time1; clock_t start,end; start=clock();
for(i=0;i<Width;i++) {
printf("%f ",h_C[i]); } printf("\n"); free(h_A); free(h_B); free(h_C);
return 0; } 全局内存实现代码
#include <stdio.h> #include <time.h> // For the CUDA runtime routines (prefixed with "cuda_") #include <cuda_runtime.h> #include <math.h>
3) 用上述三种方法实现 2 个矩阵(1024*1024)的对应列的相似度计算,比较这三种方法的运行时间。
注意:一定要验证你的结果是否正确。另外,如果你实现了 SAD,也考虑用 NCC(归一化互相关)来实现相似度计算。
CPU 实现代码
/** * the code on cpu of SAD */
fprintf(stderr, "Failed to copy matrix C from device to host (error code %s)!\n", cudaGetErrorString(err));
// Free device global memory err = cudaFree(d_A);
for (i = 0; i < numElements; ++i) {
h_A[i] = 1.1;
h_B[i] = 1.2; }
MatrixSADcpu(h_A, h_B, h_C,Width);
end=clock(); time1=(float)(end-start)/CLOCKS_PER_SEC; printf("执行时间为:%f\n",time1); printf("Done\n");
__syncthreads(); }
stride *= 2)
/** * Host main routine */
int main(int argc,char * argv[]) {
int Width=0; if(argc==1) {
printf("Input the width\n"); exit(1); } Width=atoi(argv[1]);
if (err != cudaSuccess) {
fprintf(stderr, "Failed to copy matrix A from host to device (error code %s)!\n", cudaGetErrorString(err));
if(t<Width*Width) Pd[t]=fabs(Md[t]-Nd[t]);
for (unsigned int stride = 1; stride <Width; {
if (t % (2*stride) == 0) Pd[t] += Pd[t+stride];
numElements= Width*Width; size = numElements * sizeof(float); size1=Width * sizeof(float);
printf("[The SAD of two %d by %d matrix]\n",Width,Width);
h_A = (float *)malloc(size); h_B = (float *)malloc(size); h_C = (float *)malloc(size1);
if (err != cudaSuccess) {
fprintf(stderr, "Failed to allocate device matrix B (error code %s)!\n", cudaGetErrorString(err));
// Allocate the device output matrix C float *d_C = NULL; err = cudaMalloc((void **)&d_C, size);
/** * CUDA Kernel Device code * * */
__global__ void MatrixSADGlobalMemKernel( float *Md, float *Nd, float *Pd, int Width) {
int t=blockIdx.x*blockDim.x+threadIdx.x;
err = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
fprintf(stderr, "Failed to copy matrix B from host to device (error code %s)!\n", cudaGetErrorString(err));
1) 用全局内存方法实现 2 个矩阵(32*32)的对应列的相似度计算,M 矩阵的初始值全为 1.1,N 矩阵的初始值全为 1.2。同时用 CPU 代码实现,比较两个代码的运行时间。

2) 用共享内存方法实现 2 个矩阵(1024*1024)的对应列的相似度计算,M 矩阵的初始值全为 1.1,N 矩阵的初始值全为 1.2。同时用 CPU 代码实现,比较两个代码的运行时间。
#include <stdio.h> #include <time.h> #include <malloc.h> /**
* Host code * */
void MatrixSADcpu(float *M, float *N, float *P, int Width) {
int j,k; double sum,a,b;
// Launch the matrix Add CUDA Kernel
MatrixSADGlobalMemKernel<<<Width,Width>>>(d_A, d_B, d_C,Width);
err = cudaGetLastError(); if (err != cudaSuccess) {