当前位置: 首页 > news >正文

接口网站建设网站更改备案信息在哪

接口网站建设,网站更改备案信息在哪,上海企业微信网站制作,外贸网络营销方案RTX3060 FP64测试与猜想 一.小结二.查看FP64的峰值性能三.打满FP64、FP32的利用率,对比差异四.进一步证明pipe_fp64_cycles_active并不是2个fp64 core的metrics RTX3060 FP64测试与猜想 一.小结 RTX3060 compute capability为8.6,每个SM有2个FP64 core。每个cycle可输出2个fp… RTX3060 FP64测试与猜想 一.小结二.查看FP64的峰值性能三.打满FP64、FP32的利用率,对比差异四.进一步证明pipe_fp64_cycles_active并不是2个fp64 core的metrics RTX3060 FP64测试与猜想 一.小结 RTX3060 compute capability为8.6,每个SM有2个FP64 core。每个cycle可输出2个fp64的结果RTX3060 有4个subcore,这2个core怎么给4个sub_core分呢执行FP64 DADD指令时,MIO PQ利用率超20%(FADD指令不存在该现象),且fp64 pipe的利用率最多为84%每个smsp 执行一条DADD warp指令 pipe_fp64_cycles_active 增加16个cycle,4个smsp一起运行一条DADD warp指令仍是16个cycle猜测: smsp按 1DADD/cycle 交替发送给2个FP64 core,一个warp需要16个cycle(32inst/16cycle-2inst/cycle)如果4个sub core同时按这个速度发,则超过了FP64的处理能力(8inst/cycle 2inst/cycle),但pipe_fp64_cycles_active没有增加说明,在发射FP64指令之前会检测资源的可用性,如果不足,则不发射,pipe_fp64_cycles_active也就不会增加也就解释了4个sub core一起执行时,pipe_fp64_cycles_active.max还是16个cycle执行FP64指令时,4个subcore通过MIO共享FP64实际的执行单元 二.查看FP64的峰值性能 tee fp64_peak_sustained.cu-EOF #include cuda_runtime.h #include cuda.h __global__ void fake_kernel(){} int main(int argc,char *argv[]) {fake_kernel1, 1();cudaDeviceSynchronize(); } EOF /usr/local/cuda/bin/nvcc -stdc17 -archsm_86 -lineinfo -o fp64_peak_sustained fp64_peak_sustained.cu \-I /usr/local/cuda/include -L /usr/local/cuda/lib64 -lcuda /usr/local/NVIDIA-Nsight-Compute/ncu --metrics \ sm__sass_thread_inst_executed_op_fp64_pred_on.avg.peak_sustained,\ sm__sass_thread_inst_executed_op_fp64_pred_on.sum.peak_sustained ./fp64_peak_sustained 输出 fake_kernel() (1, 1, 1)x(1, 1, 1), Context 1, Stream 7, Device 0, CC 8.6 Section: Command line profiler metrics ---------------------------------------------------------------- ----------- ------------ Metric Name Metric Unit Metric Value ---------------------------------------------------------------- ----------- ------------ sm__sass_thread_inst_executed_op_fp64_pred_on.avg.peak_sustained inst/cycle 2 #每个sm的峰值性能 sm__sass_thread_inst_executed_op_fp64_pred_on.sum.peak_sustained inst/cycle 56 #28个sm ---------------------------------------------------------------- ----------- ------------2 FP64 cores in devices of compute capability 8.6, 8.7 and 8.9问题:这2个core怎么给4个sub_core分呢? 三.打满FP64、FP32的利用率,对比差异 tee fp64_test.cu-EOF #include iostream #include cuda_runtime.h #include iostream #include vector #include stdio.h #include assert.h #include cstdio #include cuda.h__global__ void kernel_add_float(volatile float *input,volatile float *output) {unsigned int tid threadIdx.x blockIdx.x * blockDim.x;float linput[tid];float routput[tid];for(int i0;i256;i){l-r;} input[tid]l; } __global__ void kernel_add_double(volatile double *input,volatile double *output) {unsigned int tid threadIdx.x blockIdx.x * blockDim.x;double leftinput[tid];double rightoutput[tid];for(int i0;i256;i){leftright;} output[tid]left; } EOF/usr/local/cuda/bin/nvcc -stdc17 -dc -lineinfo -archsm_86 -ptx fp64_test.cu -o fp64_test.ptx /usr/local/cuda/bin/nvcc -archsm_86 fp64_test.ptx -cubin -o fp64_test.cubin /usr/local/cuda/bin/nvcc -archsm_86 fp64_test.cubin -fatbin -o fp64_test.fatbin cat fp64_test.ptx /usr/local/cuda/bin/cuobjdump --dump-sass fp64_test.fatbin# 删掉除DADD、FADD以外的指令 cuasm.py fp64_test.cubin fp64_test.cuasm sed /MOV/d -i fp64_test.cuasm sed /S2R/d -i fp64_test.cuasm sed /ULDC/d -i fp64_test.cuasm sed /IMAD/d -i fp64_test.cuasm sed /LDG/d -i fp64_test.cuasm sed /STG/d -i fp64_test.cuasm sed /F2F/d -i fp64_test.cuasmcuasm.py fp64_test.cuasm /usr/local/cuda/bin/nvcc -archsm_86 fp64_test.cubin -fatbin -o fp64_test.fatbin /usr/local/cuda/bin/cuobjdump --dump-sass fp64_test.fatbin /usr/local/cuda/bin/cuobjdump --dump-resource-usage fp64_test.fatbintee fp64_test_main.cpp-EOF #include stdio.h #include string.h #include cuda_runtime.h #include cuda.hint main(int argc,char *argv[]) {CUresult error;CUdevice cuDevice;cuInit(0);int deviceCount 0;error cuDeviceGetCount(deviceCount);error cuDeviceGet(cuDevice, 0);if(error!CUDA_SUCCESS){printf(Error happened in get device!\n);}CUcontext cuContext;error cuCtxCreate(cuContext, 0, cuDevice);if(error!CUDA_SUCCESS){printf(Error happened in create context!\n);}int block_count28*1000;int block_size32*4*4;int thread_sizeblock_count*block_size;int data_sizesizeof(double)*thread_size;double *output_ptrnullptr;double *input_ptrnullptr;int cudaStatus0;cudaStatus cudaMalloc((void**)input_ptr, data_size);cudaStatus cudaMalloc((void**)output_ptr, data_size);void *kernelParams[] {(void*)output_ptr, (void*)input_ptr};CUmodule module;CUfunction double_function;CUfunction float_function;const char* module_file fp64_test.fatbin;const char* double_kernel_name _Z17kernel_add_doublePVdS0_;const char* float_kernel_name _Z16kernel_add_floatPVfS0_;error cuModuleLoad(module, module_file);if(error!CUDA_SUCCESS){printf(Error happened in load moudle %d!\n,error);}error cuModuleGetFunction(double_function, module, double_kernel_name);if(error!CUDA_SUCCESS){printf(get double_function error!\n);}error cuModuleGetFunction(float_function, module, float_kernel_name);if(error!CUDA_SUCCESS){printf(get float_kernel_name error!\n);} cuLaunchKernel(double_function,block_count, 1, 1,block_size, 1, 1,0,0,kernelParams, 0);cuLaunchKernel(float_function,block_count, 1, 1,block_size, 1, 1,0,0,kernelParams, 0);cudaFree(output_ptr);cudaFree(input_ptr);cuModuleUnload(module);cuCtxDestroy(cuContext);return 0; } EOF g fp64_test_main.cpp -o fp64_test_main -I /usr/local/cuda/include -L /usr/local/cuda/lib64 -lcudart -lcuda /usr/local/NVIDIA-Nsight-Compute/ncu --metrics \ sm__inst_executed.avg.pct_of_peak_sustained_elapsed,\ smsp__inst_issued.sum,\ sm__issue_active.avg.pct_of_peak_sustained_elapsed,\ sm__pipe_fma_cycles_active.avg.pct_of_peak_sustained_elapsed,\ sm__pipe_fmaheavy_cycles_active.avg.pct_of_peak_sustained_elapsed,\ sm__inst_executed_pipe_cbu_pred_on_any.avg.pct_of_peak_sustained_elapsed,\ sm__mio_inst_issued.avg.pct_of_peak_sustained_elapsed,\ sm__mio_pq_read_cycles_active.avg.pct_of_peak_sustained_elapsed,\ sm__mio_pq_write_cycles_active.avg.pct_of_peak_sustained_elapsed,\ sm__mio_pq_write_cycles_active_pipe_lsu.avg.pct_of_peak_sustained_elapsed,\ sm__mio_pq_write_cycles_active_pipe_tex.avg.pct_of_peak_sustained_elapsed,\ sm__mioc_inst_issued.avg.pct_of_peak_sustained_elapsed,\ sm__mio_inst_issued.avg.pct_of_peak_sustained_elapsed,\ sm__pipe_fp64_cycles_active.avg.pct_of_peak_sustained_elapsed ./fp64_test_main输出 kernel_add_double(volatile double *, volatile double *) (28000, 1, 1)x(512, 1, 1), Context 1, Stream 7, Device 0, CC 8.6 Section: Command line profiler metrics ------------------------------------------------------------------------- ----------- ------------ Metric Name Metric Unit Metric Value ------------------------------------------------------------------------- ----------- ------------ sm__inst_executed.avg.pct_of_peak_sustained_elapsed % 1.32 sm__inst_executed_pipe_cbu_pred_on_any.avg.pct_of_peak_sustained_elapsed % 0.02 sm__issue_active.avg.pct_of_peak_sustained_elapsed % 1.32 sm__mio_inst_issued.avg.pct_of_peak_sustained_elapsed % 3.51 sm__mio_pq_read_cycles_active.avg.pct_of_peak_sustained_elapsed % 0 sm__mio_pq_write_cycles_active.avg.pct_of_peak_sustained_elapsed % 21.05 sm__mio_pq_write_cycles_active_pipe_lsu.avg.pct_of_peak_sustained_elapsed % 0 sm__mio_pq_write_cycles_active_pipe_tex.avg.pct_of_peak_sustained_elapsed % 21.05 # of cycles where register operands from the register file werewritten to MIO PQ, for the tex pipe sm__mioc_inst_issued.avg.pct_of_peak_sustained_elapsed % 1.32 sm__pipe_fma_cycles_active.avg.pct_of_peak_sustained_elapsed % 0 sm__pipe_fmaheavy_cycles_active.avg.pct_of_peak_sustained_elapsed % 0 sm__pipe_fp64_cycles_active.avg.pct_of_peak_sustained_elapsed % 84.21 #利用率打不满 smsp__inst_issued.sum inst 115,136,000 #跟fp32相同的指令条数 ------------------------------------------------------------------------- ----------- ------------kernel_add_float(volatile float *, volatile float *) (28000, 1, 1)x(512, 1, 1), Context 1, Stream 7, Device 0, CC 8.6 Section: Command line profiler metrics ------------------------------------------------------------------------- ----------- ------------ Metric Name Metric Unit Metric Value ------------------------------------------------------------------------- ----------- ------------ sm__inst_executed.avg.pct_of_peak_sustained_elapsed % 99.76 sm__inst_executed_pipe_cbu_pred_on_any.avg.pct_of_peak_sustained_elapsed % 1.55 sm__issue_active.avg.pct_of_peak_sustained_elapsed % 99.76 sm__mio_inst_issued.avg.pct_of_peak_sustained_elapsed % 0 sm__mio_pq_read_cycles_active.avg.pct_of_peak_sustained_elapsed % 0 sm__mio_pq_write_cycles_active.avg.pct_of_peak_sustained_elapsed % 0 sm__mio_pq_write_cycles_active_pipe_lsu.avg.pct_of_peak_sustained_elapsed % 0 sm__mio_pq_write_cycles_active_pipe_tex.avg.pct_of_peak_sustained_elapsed % 0 sm__mioc_inst_issued.avg.pct_of_peak_sustained_elapsed % 0.39 sm__pipe_fma_cycles_active.avg.pct_of_peak_sustained_elapsed % 99.37 sm__pipe_fmaheavy_cycles_active.avg.pct_of_peak_sustained_elapsed % 99.37 sm__pipe_fp64_cycles_active.avg.pct_of_peak_sustained_elapsed % 0 smsp__inst_issued.sum inst 115,136,000 ------------------------------------------------------------------------- ----------- ------------*猜测,sm__pipe_fp64_cycles_active并不是那2个FP64 core的metrics,而是smsp里通往fp64 core的接口模块的活动cycle数 *4个subcore里的fp64接口模块,连接到2个fp64 core,并且经过了mio模块.因此,无法打满fp64的利用率 四.进一步证明pipe_fp64_cycles_active并不是2个fp64 core的metrics tee fp64_test.cu-EOF #include iostream #include cuda_runtime.h #include iostream #include vector #include stdio.h #include assert.h #include cstdio #include cuda.h__global__ void kernel_add_double(volatile double *input,volatile double *output) {unsigned int tid threadIdx.x blockIdx.x * blockDim.x;double leftinput[tid];double rightoutput[tid];for(int i0;i1;i){leftright;} output[tid]left; } EOF/usr/local/cuda/bin/nvcc -stdc17 -dc -lineinfo -archsm_86 -ptx fp64_test.cu -o fp64_test.ptx /usr/local/cuda/bin/nvcc -archsm_86 fp64_test.ptx -cubin -o fp64_test.cubin /usr/local/cuda/bin/nvcc -archsm_86 fp64_test.cubin -fatbin -o fp64_test.fatbin cat fp64_test.ptx /usr/local/cuda/bin/cuobjdump --dump-sass fp64_test.fatbincuasm.py fp64_test.cubin fp64_test.cuasm sed /MOV/d -i fp64_test.cuasm sed /S2R/d -i fp64_test.cuasm sed /ULDC/d -i fp64_test.cuasm sed /IMAD/d -i fp64_test.cuasm sed /LDG/d -i fp64_test.cuasm sed /STG/d -i fp64_test.cuasm sed /F2F/d -i fp64_test.cuasmcuasm.py fp64_test.cuasm /usr/local/cuda/bin/nvcc -archsm_86 fp64_test.cubin -fatbin -o fp64_test.fatbin /usr/local/cuda/bin/cuobjdump --dump-sass fp64_test.fatbin /usr/local/cuda/bin/cuobjdump --dump-resource-usage fp64_test.fatbintee fp64_test_main.cpp-EOF #include stdio.h #include string.h #include cuda_runtime.h #include cuda.hint main(int argc,char *argv[]) {CUresult error;CUdevice cuDevice;cuInit(0);int deviceCount 0;error cuDeviceGetCount(deviceCount);error cuDeviceGet(cuDevice, 0);if(error!CUDA_SUCCESS){printf(Error happened in get device!\n);}CUcontext cuContext;error cuCtxCreate(cuContext, 0, cuDevice);if(error!CUDA_SUCCESS){printf(Error happened in create context!\n);}int block_count1;int block_size32*4*4;int thread_sizeblock_count*block_size;int data_sizesizeof(double)*thread_size;double *output_ptrnullptr;double *input_ptrnullptr;int cudaStatus0;cudaStatus cudaMalloc((void**)input_ptr, data_size);cudaStatus cudaMalloc((void**)output_ptr, data_size);void *kernelParams[] {(void*)output_ptr, (void*)input_ptr};CUmodule module;CUfunction double_function;const char* module_file fp64_test.fatbin;const char* double_kernel_name _Z17kernel_add_doublePVdS0_;error cuModuleLoad(module, module_file);if(error!CUDA_SUCCESS){printf(Error happened in load moudle %d!\n,error);}error cuModuleGetFunction(double_function, module, double_kernel_name);if(error!CUDA_SUCCESS){printf(get float_kernel_name error!\n);} cuLaunchKernel(double_function,block_count, 1, 1,8, 1, 1,0,0,kernelParams, 0);cuLaunchKernel(double_function,block_count, 1, 1,16, 1, 1,0,0,kernelParams, 0);cuLaunchKernel(double_function,block_count, 1, 1,32, 1, 1,0,0,kernelParams, 0);cuLaunchKernel(double_function,block_count, 1, 1,32*2, 1, 1,0,0,kernelParams, 0);cuLaunchKernel(double_function,block_count, 1, 1,32*4, 1, 1,0,0,kernelParams, 0);cuLaunchKernel(double_function,block_count, 1, 1,32*5, 1, 1,0,0,kernelParams, 0);cuLaunchKernel(double_function,block_count, 1, 1,32*4*8, 1, 1,0,0,kernelParams, 0);cudaFree(output_ptr);cudaFree(input_ptr);cuModuleUnload(module);cuCtxDestroy(cuContext);return 0; } EOF g fp64_test_main.cpp -o fp64_test_main -I /usr/local/cuda/include -L /usr/local/cuda/lib64 -lcudart -lcuda/usr/local/NVIDIA-Nsight-Compute/ncu --metrics smsp__pipe_fp64_cycles_active ./fp64_test_main输出 kernel_add_double(volatile double *, volatile double *) (1, 1, 1)x(8, 1, 1), Context 1, Stream 7, Device 0, CC 8.6 Section: Command line profiler metrics --------------------------------- ----------- ------------ Metric Name Metric Unit Metric Value --------------------------------- ----------- ------------ smsp__pipe_fp64_cycles_active.avg cycle 0.14 smsp__pipe_fp64_cycles_active.max cycle 16 smsp__pipe_fp64_cycles_active.min cycle 0 smsp__pipe_fp64_cycles_active.sum cycle 16 --------------------------------- ----------- ------------kernel_add_double(volatile double *, volatile double *) (1, 1, 1)x(16, 1, 1), Context 1, Stream 7, Device 0, CC 8.6 Section: Command line profiler metrics --------------------------------- ----------- ------------ Metric Name Metric Unit Metric Value --------------------------------- ----------- ------------ smsp__pipe_fp64_cycles_active.avg cycle 0.14 smsp__pipe_fp64_cycles_active.max cycle 16 #不足一个warp跟一个warp 的pipe_fp64_cycles_active一样,说明存在无效计算 smsp__pipe_fp64_cycles_active.min cycle 0 smsp__pipe_fp64_cycles_active.sum cycle 16 --------------------------------- ----------- ------------kernel_add_double(volatile double *, volatile double *) (1, 1, 1)x(32, 1, 1), Context 1, Stream 7, Device 0, CC 8.6 Section: Command line profiler metrics --------------------------------- ----------- ------------ Metric Name Metric Unit Metric Value --------------------------------- ----------- ------------ smsp__pipe_fp64_cycles_active.avg cycle 0.14 smsp__pipe_fp64_cycles_active.max cycle 16 smsp__pipe_fp64_cycles_active.min cycle 0 smsp__pipe_fp64_cycles_active.sum cycle 16 --------------------------------- ----------- ------------kernel_add_double(volatile double *, volatile double *) (1, 1, 1)x(64, 1, 1), Context 1, Stream 7, Device 0, CC 8.6 Section: Command line profiler metrics --------------------------------- ----------- ------------ Metric Name Metric Unit Metric Value --------------------------------- ----------- ------------ smsp__pipe_fp64_cycles_active.avg cycle 0.29 smsp__pipe_fp64_cycles_active.max cycle 16 smsp__pipe_fp64_cycles_active.min cycle 0 smsp__pipe_fp64_cycles_active.sum cycle 32 --------------------------------- ----------- ------------kernel_add_double(volatile double *, volatile double *) (1, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 8.6 Section: Command line profiler metrics --------------------------------- ----------- ------------ Metric Name Metric Unit Metric Value --------------------------------- ----------- ------------ smsp__pipe_fp64_cycles_active.avg cycle 0.57 smsp__pipe_fp64_cycles_active.max cycle 16 smsp__pipe_fp64_cycles_active.min cycle 0 smsp__pipe_fp64_cycles_active.sum cycle 64 --------------------------------- ----------- ------------kernel_add_double(volatile double *, volatile double *) (1, 1, 1)x(160, 1, 1), Context 1, Stream 7, Device 0, CC 8.6 Section: Command line profiler metrics --------------------------------- ----------- ------------ Metric Name Metric Unit Metric Value --------------------------------- ----------- ------------ smsp__pipe_fp64_cycles_active.avg cycle 0.71 smsp__pipe_fp64_cycles_active.max cycle 32 smsp__pipe_fp64_cycles_active.min cycle 0 smsp__pipe_fp64_cycles_active.sum cycle 80 --------------------------------- ----------- ------------kernel_add_double(volatile double *, volatile double *) (1, 1, 1)x(1024, 1, 1), Context 1, Stream 7, Device 0, CC 8.6 Section: Command line profiler metrics --------------------------------- ----------- ------------ Metric Name Metric Unit Metric Value --------------------------------- ----------- ------------ smsp__pipe_fp64_cycles_active.avg cycle 4.57 smsp__pipe_fp64_cycles_active.max cycle 128 smsp__pipe_fp64_cycles_active.min cycle 0 smsp__pipe_fp64_cycles_active.sum cycle 512 #每个smsp 执行一个warp的fp64需要16个pipe_fp64_cycles_active --------------------------------- ----------- ------------如果是2个fp64 cores的metrics,不会出现这样的现象
http://www.w-s-a.com/news/34442/

相关文章:

  • 汽车行业做网站网站改版seo建议
  • 建设职业注册中心网站photoshop属于什么软件
  • 公司网站展示有哪些wordpress工单
  • iis新建网站seo是做什么工作的
  • 临沂网站建设厂家做外贸的女生现状
  • 电子商务网站建设实践临沂做网站的
  • 网站职能建设论文做外贸都有哪些网站
  • 网站建设项目需求分析房地产网站源码
  • 网站充值提现公司账务怎么做中国能建设计公司网站
  • 网站信息资源建设包括哪些网站网站做维护
  • 网站性能优化的方法有哪些建设施工合同网站
  • 郑州建设企业网站山西省住房和城乡建设厅网站
  • 做网站的去哪找客户正规制作网站公司
  • 网站代理访问是什么意思外国优秀设计网站
  • 合肥个人建站模板网络技术服务有限公司
  • 做网站什么公司好dw企业网站开发教程
  • 怎么做自己的个人网站宝安网站设计哪家最好
  • 浩博建设集团网站站长网站统计
  • 电商网站开发视频seo排名优化方式方法
  • 宿迁市住房城乡建设局网站wordpress纯图片主题
  • 广州建设网站 公司湖北省建设信息网官网
  • 网站建立时间查询做百度移动网站优化排
  • 网站建设和运行费用qq恢复官方网站
  • 可以免费建立网站吗免费的网络营销方式
  • 天津企业设计网站建设wordpress标题字体大小
  • 上高县城乡规划建设局网站创业项目
  • 钓鱼网站在线下载广州网站设计哪里好
  • 做阿里国际网站会有成效吗wordpress微博同步
  • 西安网站建设云速网络网站运营情况怎么写
  • 免费建网站的网站微信商城分销系统方案