当前位置：首页 > news >正文

深圳网站建设首选上榜网络装修网站怎么做的好

news 2026/4/20 4:07:24

深圳网站建设首选上榜网络,装修网站怎么做的好,微信小程序注册方式,喜迎二十大

前置准备：安装 NVIDIA Nsight Compute。安装好后选择使用管理员权限启动；下载官方 Demo 代码；官方博客 Shuffle warp。1. 任务介绍及CPU版本 1.1 任务介绍 任务理解：有一个 L x M 的矩阵 $M_1$，对其每行取平均值得到 $V_1 \in \mathbb{R}^{L \times 1}$ …

前置准备

- 安装 NVIDIA Nsight Compute。安装好后选择使用管理员权限启动
- 下载官方 Demo 代码
- 官方博客 Shuffle warp

1. 任务介绍及CPU版本

1.1 任务介绍

任务理解：

- 有一个 L x M 的矩阵 $M_1$，对其每行取平均值，得到 $V_1 \in \mathbb{R}^{L \times 1}$ 的列向量
- 有一个 L x L 的矩阵 $M_2$，与 $V_1$ 做矩阵乘法，最后得到 $V_2 \in \mathbb{R}^{L \times 1}$ 的列向量

1.2 CPU版本实现

```cpp
/**
 * input:  input 矩阵维度为 (N, L, M)，N 为 Batch
 * output: output 矩阵维度为 (L, N)，按 output[i*N+k] 存储
 * matrix: matrix 维度为 (L, L)
 */
template <typename T>
void cpu_version1(T *input, T *output, T *matrix, int L, int M, int N){
#pragma omp parallel for
  for (int k = 0; k < N; k++){      // repeat the following, N times
    std::vector<T> v1(L);           // vector length of L
    for (int i = 0; i < M; i++)     // compute average vector over M input vectors
      for (int j = 0; j < L; j++)
        v1[j] += input[k*M*L+j*M+i];
    for (int j = 0; j < L; j++)
      v1[j] /= M;
    for (int i = 0; i < L; i++)     // matrix-vector multiply
      for (int j = 0; j < L; j++)
        output[i*N+k] += matrix[i*L+j]*v1[j];
  }
}
```

1.3 计时逻辑

```cpp
#include <time.h>
#ifdef __linux__
#define USECPSEC 1000000ULL
#include <sys/time.h>
unsigned long long dtime_usec(unsigned long long start){
  timeval tv;
  gettimeofday(&tv, 0);
  return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
#elif defined(WIN32)
#include <windows.h>
double dtime_usec(double start) {
  LARGE_INTEGER frequency;  // ticks per second
  LARGE_INTEGER t1;         // ticks
  double elapsedTime;
  // get ticks per second
  QueryPerformanceFrequency(&frequency);
  // get current time
  QueryPerformanceCounter(&t1);
  // compute the elapsed time in micro-second resolution(微秒)
  elapsedTime = (t1.QuadPart - start) * 1000000.0 / frequency.QuadPart;
  return elapsedTime;
}
#endif
```

1.4 main函数

```cpp
typedef double ft;

int main(){
  ft *d_input, *h_input, *d_output, *h_outputc, *h_outputg, *d_matrix, *h_matrix;
  int L = my_L; int M = my_M; int N = my_N;
  // host allocations
  h_input   = new ft[N*L*M];
  h_matrix  = new ft[L*L];
  h_outputg = new ft[N*L];
  h_outputc = new ft[N*L];
  // data initialization
  for (int i = 0; i < N*L*M; i++) h_input[i] = (rand()&1)+1;  // 1 or 2
  for (int i = 0; i < L*L; i++) h_matrix[i] = (rand()&1)+1;   // 1 or 2
  // create result to test for correctness
  LARGE_INTEGER st;
  double dt;
  QueryPerformanceCounter(&st);  // 获取起始时间点
  cpu_version1(h_input, h_outputc, h_matrix, L, M, N);
  dt = dtime_usec(st.QuadPart);
  std::cout << "CPU execution time: \t" << dt / 1000.0f << "ms" << std::endl;
  // device allocations
  cudaMalloc(&d_input, N*L*M*sizeof(ft));
  cudaMalloc(&d_output, N*L*sizeof(ft));
  cudaMalloc(&d_matrix, L*L*sizeof(ft));
  cudaCheckErrors("cudaMalloc failure");
  // copy input data from host to device
  cudaMemcpy(d_input, h_input, N*L*M*sizeof(ft), cudaMemcpyHostToDevice);
  cudaMemcpy(d_matrix, h_matrix, L*L*sizeof(ft), cudaMemcpyHostToDevice);
  cudaMemset(d_output, 0, N*L*sizeof(ft));
  cudaCheckErrors("cudaMemcpy/Memset failure");
  // run on device and measure execution time
  QueryPerformanceCounter(&st);  // 获取起始时间点
  gpu_version1<<<1, L>>>(d_input, d_output, d_matrix, L, M, N);
  cudaCheckErrors("kernel launch failure");
  cudaDeviceSynchronize();
  cudaCheckErrors("kernel execution failure");
  dt = dtime_usec(st.QuadPart);
  cudaMemcpy(h_outputg, d_output, N*L*sizeof(ft), cudaMemcpyDeviceToHost);
  cudaCheckErrors("cudaMemcpy failure");
  for (int i = 0; i < N*L; i++)
    if (h_outputg[i] != h_outputc[i]) {
      std::cout << "Mismatch at " << i << " was: " << h_outputg[i] << " should be: " << h_outputc[i] << std::endl;
      return 0;
    }
  std::cout << "Kernel execution time: \t" << dt / 1000.0f << "ms" << std::endl;
  return 0;
}
```

2. 
GPU version1

2.1 实现

```cpp
template <typename T>
__global__ void gpu_version1(const T * __restrict__ input, T * __restrict__ output, const T * __restrict__ matrix,
                             const int L, const int M, const int N) {
    __shared__ T smem[my_L];
    int idx = threadIdx.x;
    // 依次处理 N 个 Batch
    for (int k = 0; k < N; k++) {
        T v = 0;
        for (int i = 0; i < M; i++) {
            v += input[k * M * L + M * idx + i];
        }
        v /= M;
        for (int row = 0; row < L; row++) {
            __syncthreads();
            smem[idx] = v * matrix[row * L + idx];
            // 对矩阵乘法求和
            for (int s = blockDim.x >> 1; s > 0; s >>= 1) {
                __syncthreads();
                if (idx < s) smem[threadIdx.x] += smem[threadIdx.x + s];
            }
            if (!threadIdx.x) output[row * N + k] = smem[0];
        }
    }
}

const int my_L = 512;  // maximum 1024
const int my_M = 512;
const int my_N = 1024;
// 调用核函数
gpu_version1<<<1, L>>>(d_input, d_output, d_matrix, L, M, N);
```

2.2 时间对比

```
CPU execution time: 2764.96ms
Kernel execution time: 1508.34ms
```

2.3 Nsight 分析

可以看到这里：This kernel grid is too small to fill the available resources on this device, resulting in only 0.0 full waves across all SMs.

这里说我们的 grid size 设置得太小了，没有充分利用 SM。下一版我们增大 Grid size。

3. 增加GridSize

3.1 实现

分析上一个核函数调用

```cpp
gpu_version1<<<1, L>>>(d_input, d_output, d_matrix, L, M, N);
```

这里 grid size 是 1，这里可以考虑把这个设置为 Batch size 即 N

```cpp
gpu_version2<<<N, L>>>(d_input, d_output, d_matrix, L, M, N);
```

对应的核函数修改，其实就是把 k 替换成了 blockIdx.x

```cpp
template <typename T>
__global__ void gpu_version2(const T* __restrict__ input, T* __restrict__ output, const T* __restrict__ matrix,
                             const int L, const int M, const int N) {
    // parallelize threadIdx.x over vector length, and blockIdx.x across k (N)
    __shared__ T smem[my_L];
    int idx = threadIdx.x;
    int k = blockIdx.x;
    T v1 = 0;
    for (int i = 0; i < M; i++)
        v1 += input[k * M * L + idx * M + i];
    v1 /= M;
    for (int i = 0; i < L; i++) {
        __syncthreads();
        smem[threadIdx.x] = v1 * matrix[i * L + idx];
        for (int s = blockDim.x >> 1; s > 0; s >>= 1) {
            __syncthreads();
            if (threadIdx.x < s) smem[threadIdx.x] += smem[threadIdx.x + s];
        }
        if (!threadIdx.x) output[k + i * N] = smem[0];
    }
}
```

3.2 时间对比

```
CPU execution time: 3219.04ms
Kernel execution time v1: 1508.34ms
Kernel execution time v2: 91.4291ms
```

可以看到相对 v1 提高了15倍

3.3 Nsight 分析

3.3.1 概览

相比上一个版本，SM 和 memory 的利用率明显提高了。

3.3.2 Memory Workload Analysis

The memory access pattern for 
global loads in L1TEX might not be optimal. On average, this kernel accesses 8.0 bytes per thread per memory request; but the address pattern, possibly caused by the stride between threads, results in 20.0 sectors per request, or 20.0*32 = 639.3 bytes of cache data transfers per request. The optimal thread address pattern for 8.0 byte accesses would result in 8.0*32 = 256.0 bytes of cache data transfers per request, to maximize L1TEX cache performance. Check the Source Counters section for uncoalesced global loads.

在 L1TEX 中，全局加载的内存访问模式可能不是最优的。平均来说，这个 kernel 每个线程每次内存请求会访问 8 字节，但是当前的地址模式（可能由于线程之间的跨度）导致每次请求了 20 个 sectors，即每次请求的缓存传输为 20 * 32 = 639.3 字节。对于 8 字节的访问，最优的线程地址模式使用的缓存传输数据为 8 * 32 = 256 字节。为了最大化 L1TEX 缓存性能，请在 Source Counters 中检查没有 coalesced（合并访问）的全局加载。

为什么会导致这么高？通过 source 页面可以发现，高亮的这一句进行了大量的 L2 Cache 访问操作。为什么这一句会导致访问不合并呢？是因为同一次 for 循环迭代中，各线程访问的下标为 idx * M + i，相邻线程之间隔了 M 个数据，无法合并访问，导致缓存利用率低。如何解决这个问题呢？看下一节。

查看全文

http://www.hkea.cn/news/14336482/