• Grid → Block → Thread: idx = BlockID * BlockSize + ThreadID (in CUDA: blockIdx.x * blockDim.x + threadIdx.x)

nvidia-smi
watch -n 1 nvidia-smi
nsys profile -t cuda,nvtx,osrt -o add_cuda -f true ./add_cuda
nsys stats add_cuda.nsys-rep

Nsight Compute:
ncu --details-all --nvtx --call-stack --set full ./add_cuda

ncu --nvtx --call-stack --set full -f --export add_cuda.ncu-rep ./add_cuda

Roofline 模型 (roofline model)