{"id":321,"date":"2024-10-23T14:23:45","date_gmt":"2024-10-23T06:23:45","guid":{"rendered":"https:\/\/www.qibinliang.com\/?p=321"},"modified":"2024-10-23T14:23:45","modified_gmt":"2024-10-23T06:23:45","slug":"cuda%e7%bc%96%e7%a8%8b%e5%ad%a6%e4%b9%a0%e7%ac%94%e8%ae%b0%ef%bc%88%e4%ba%8c%ef%bc%89","status":"publish","type":"post","link":"https:\/\/www.qibinliang.com\/index.php\/2024\/10\/23\/cuda%e7%bc%96%e7%a8%8b%e5%ad%a6%e4%b9%a0%e7%ac%94%e8%ae%b0%ef%bc%88%e4%ba%8c%ef%bc%89\/","title":{"rendered":"Cuda\u7f16\u7a0b\u5b66\u4e60\u7b14\u8bb0\uff08\u4e8c\uff09"},"content":{"rendered":"\n<h2 class=\"wp-block-heading\">\u8bfe\u7a0b\u5185\u5bb9\uff08CUDA Shared Memory\uff09<\/h2>\n\n\n\n<p>\u6b64\u6b21\u8bb2\u5ea7\u662fOLCF CUDA\u57f9\u8bad\u7cfb\u5217\u7684\u7b2c\u4e8c\u90e8\u5206\uff0c\u91cd\u70b9\u8bb2\u89e3\u4e86\u5982\u4f55\u5728CUDA\u7f16\u7a0b\u4e2d\u4f7f\u7528\u5171\u4eab\u5185\u5b58\uff08Shared Memory\uff09\u6765\u4f18\u5316\u6027\u80fd\u3002\u4e3b\u8bb2\u4ebaTom Papatheodore\u8be6\u7ec6\u8bf4\u660e\u4e86\u5171\u4eab\u5185\u5b58\u7684\u7279\u70b9\u3001\u4f7f\u7528\u65b9\u6cd5\u53ca\u5176\u4e0e\u5168\u5c40\u5185\u5b58\u7684\u533a\u522b\uff0c\u5e76\u901a\u8fc7\u5177\u4f53\u793a\u4f8b\u5c55\u793a\u5982\u4f55\u5b9e\u73b0\u6570\u636e\u7f13\u5b58\u548c\u7ebf\u7a0b\u540c\u6b65\u3002\u6700\u540e\uff0c\u8bfe\u7a0b\u901a\u8fc7\u6a21\u677f\u64cd\u4f5c\uff08Stencil operation\uff09\u5c55\u793a\u4e86\u5728CUDA\u7f16\u7a0b\u4e2d\u5171\u4eab\u5185\u5b58\u7684\u4f7f\u7528\u65b9\u5f0f\u3002<\/p>\n\n\n\n<h3 class=\"wp-block-heading\">\u5171\u4eab\u5185\u5b58\uff08Shared Memory\uff09<\/h3>\n\n\n\n<p>\u5728\u4e0a\u4e00\u7ae0\u8282<a href=\"https:\/\/www.qibinliang.com\/index.php\/2024\/10\/17\/cuda%e7%bc%96%e7%a8%8b%e5%ad%a6%e4%b9%a0%e7%ac%94%e8%ae%b0%ef%bc%88%e4%b8%80%ef%bc%89\/\" data-type=\"post\" data-id=\"218\"><mark style=\"background-color:rgba(0, 0, 0, 0)\" class=\"has-inline-color 
has-vivid-cyan-blue-color\"><strong><em>Cuda\u7f16\u7a0b\u5b66\u4e60\u7b14\u8bb0\uff08\u4e00\uff09<\/em><\/strong><\/mark><\/a>\u4e2d\u8bb2\u5230\uff0c\u6211\u4eec\u9700\u8981\u4e3a\u7a0b\u5e8f\u4e2d\u7684\u6570\u636e\u5206\u914d\u8bbe\u5907\u5185\u5b58\uff0c\u5728\u51c6\u5907\u9636\u6bb5\u6211\u4eec\u9700\u8981\u5728CPU\u548cGPU\u4e0a\u521d\u59cb\u5316\u6211\u4eec\u9700\u8981\u7684\u5185\u5b58\u7a7a\u95f4\u5927\u5c0f\uff0c\u7136\u540e\u628aCPU\u7684\u6570\u636e\u590d\u5236\u5230GPU\u5185\u5b58\u4e0a\uff0c\u8fd9\u91cc\u63d0\u5230\u7684GPU\u5185\u5b58\u662f\u5168\u5c40\u5185\u5b58\uff08Global Memory\uff09\u3002\u4f46\u662f\uff0c\u5728\u672c\u7ae0\u8282\u4e2d\u6211\u4eec\u6240\u63d0\u5230\u7684\u5185\u5b58\u5219\u662f\u5171\u4eab\u5185\u5b58\uff08Shared Memory\uff09\u3002<\/p>\n\n\n\n<p>\u5171\u4eab\u5185\u5b58\u6709\u4ee5\u4e0b\u7279\u6027\u533a\u522b\u4e8e\u5168\u5c40\u5185\u5b58\uff1a<\/p>\n\n\n\n<ol class=\"wp-block-list\">\n<li>\u5168\u5c40\u5185\u5b58\u4e00\u822c\u6307\u7684\u662f\u786c\u4ef6\u4e0a\u7684\u5185\u5b58\uff0c\u6362\u53e5\u8bdd\u8bb2\u5c31\u662f\u5b83\u7684\u5b9e\u73b0\u662f\u5728GPU\u7684DRAM\uff0c\u800c\u4e0d\u662f\u5728GPU\u672c\u8eab\uff08on-die\uff09\uff0c\u5b83\u662f\u7531\u4e00\u5b9a\u6570\u91cf\u7684\u4e0eGPU\u8fde\u63a5\u7684\u9ad8\u901f\u5185\u5b58\u6240\u7ec4\u6210\u3002\u76f8\u53cd\uff0c\u5171\u4eab\u5185\u5b58\u6307\u7684\u662f\u771f\u6b63\u5728GPU\u5185\u90e8\u5b9e\u73b0\u7684\u5185\u5b58\uff0c\u4e5f\u53eb\u505aon-chip 
memory\u3002<\/li>\n\n\n\n<li>\u5171\u4eab\u5185\u5b58\u62e5\u6709\u66f4\u5feb\u7684\u901f\u5ea6\u4ee5\u53ca\u66f4\u9ad8\u7684\u5e26\u5bbd\uff0c\u7c97\u7565\u4f30\u7b97\u6bd4\u5168\u5c40\u5185\u5b58\u5927\u6982\u5feb5\u500d\u3002<\/li>\n\n\n\n<li>\u5171\u4eab\u5185\u5b58\u662f\u7531\u7528\u6237\u81ea\u884c\u7ba1\u7406\u7684\u3002<\/li>\n\n\n\n<li>\u5171\u4eab\u5185\u5b58\u4f7f\u7528<code>__shared__<\/code>\u5173\u952e\u5b57\u533a\u5206\u3002<\/li>\n\n\n\n<li>\u5171\u4eab\u5185\u5b58\u662f\u5355\u4e2a\u533a\u5757\uff08block\uff09\u7684\u903b\u8f91\u8d44\u6e90\uff0c\u8fd9\u610f\u5473\u7740\u6bcf\u4e2a\u533a\u5757\u90fd\u53ef\u4ee5\u7ba1\u7406\u4e00\u4e2a\u53ea\u5c5e\u4e8e\u4ed6\u81ea\u5df1\u7684\u5171\u4eab\u5185\u5b58\uff0c\u8fd9\u79cd\u8d44\u6e90\u662f\u53ef\u4ee5\u88ab\u533a\u5757\u5185\u7684\u7ebf\u7a0b\u6240\u5171\u4eab\u7684\uff0c\u4f46\u5374\u65e0\u6cd5\u88ab\u5176\u4ed6\u533a\u5757\u4ee5\u53ca\u5176\u4ed6\u533a\u5757\u7684\u7ebf\u7a0b\u8bbf\u95ee\u3002<\/li>\n<\/ol>\n\n\n<div class=\"wp-block-image\">\n<figure class=\"aligncenter size-large is-resized\"><div class='fancybox-wrapper lazyload-container-unload' data-fancybox='post-images' href='https:\/\/www.qibinliang.com\/wp-content\/uploads\/2024\/10\/image-3-1024x565.png'><img class=\"lazyload lazyload-style-1\" src=\"data:image\/svg+xml;base64,PCEtLUFyZ29uTG9hZGluZy0tPgo8c3ZnIHdpZHRoPSIxIiBoZWlnaHQ9IjEiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyIgc3Ryb2tlPSIjZmZmZmZmMDAiPjxnPjwvZz4KPC9zdmc+\"  loading=\"lazy\" decoding=\"async\" width=\"1024\" height=\"565\" data-original=\"https:\/\/www.qibinliang.com\/wp-content\/uploads\/2024\/10\/image-3-1024x565.png\" src=\"data:image\/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAADsQAAA7EAZUrDhsAAAANSURBVBhXYzh8+PB\/AAffA0nNPuCLAAAAAElFTkSuQmCC\" alt=\"\" class=\"wp-image-327\" style=\"width:652px;height:auto\"  sizes=\"auto, (max-width: 1024px) 100vw, 1024px\" \/><\/div><figcaption 
class=\"wp-element-caption\">\u56fe1. \u5171\u4eab\u5185\u5b58\u7684\u7279\u6027<\/figcaption><\/figure>\n<\/div>\n\n\n<h3 class=\"wp-block-heading\">\u5b9e\u73b0\u5171\u4eab\u5185\u5b58<\/h3>\n\n\n\n<p>\u89c6\u9891\u4e2dTom\u901a\u8fc7\u4e00\u4e2a\u6a21\u677f\u64cd\u4f5c\uff08Stencil operation\uff09\u7684\u4f8b\u5b50\u6765\u8be6\u7ec6\u89e3\u91ca\u4e86\u5171\u4eab\u5185\u5b58\u7684\u7528\u6cd5\uff0c\u6a21\u677f\u64cd\u4f5c\u6709\u70b9\u7c7b\u4f3c\u4e8e\u795e\u7ecf\u7f51\u7edc\u7684\u5377\u79ef\u64cd\u4f5c\uff0c\u4e3b\u8981\u662f\u901a\u8fc7\u7279\u5b9a\u7684\u6a21\u5f0f\u5bf9\u4e00\u7cfb\u5217\u7684\u6570\u503c\u6570\u636e\u8fdb\u884c\u5904\u7406\uff0c\u8be6\u7ec6\u53ef\u89c1<a href=\"https:\/\/en.wikipedia.org\/wiki\/Iterative_Stencil_Loops\">\u7ef4\u57fa\u767e\u79d1<\/a>\u89e3\u91ca\u3002\u56fe2\uff0cTom\u5728\u89c6\u9891\u4e2d\u89e3\u91ca\u9053\uff0c\u8fd9\u4e2astencil kernel\u5c31\u662f\u4e00\u4e2a\u6ed1\u52a8\u7a97\u53e3\uff0c\u5bf9\u4e8e\u6bcf\u4e00\u4e2a\u5143\u7d20\u8ba1\u7b97\u5305\u62ec\u5de6\u53f3\u4e09\u4e2a\u5143\u7d20\u5728\u5185\u5171\u4e03\u4e2a\u5143\u7d20\u4e4b\u548c\u3002<\/p>\n\n\n\n<p>\u4f7f\u7528CUDA\u5e76\u884c\u8ba1\u7b97\u53ef\u4ee5\u5c06\u6570\u7ec4\u5206\u6210\u591a\u4e2ablock\u5e76\u884c\u5730\u8ba1\u7b97\uff0c\u5728\u8fd9\u91cc\u4f7f\u7528\u5171\u4eab\u5185\u5b58\u5c31\u53ef\u4ee5\u8ba9\u4e00\u4e2ablock\u5185\u7684\u7ebf\u7a0b\u66f4\u5feb\u5730\u8bfb\u53d6\u5230\u6240\u9700\u8981\u7684\u6570\u636e\u3002\u6211\u4eec\u53ef\u4ee5\u5206\u914d\u4e00\u4e2a<code>blockDim+6<\/code>\u5927\u5c0f\u7684\u5171\u4eab\u5185\u5b58\u7a7a\u95f4\u7528\u6765\u5b58\u50a8\u6ed1\u52a8\u7a97\u53e3\u6240\u9700\u8981\u7684\u6570\u636e\u3002\u8be6\u7ec6\u4ee3\u7801\u4f1a\u5728\u8bfe\u540e\u7ec3\u4e60\u4e2d\u5c55\u793a\u3002<\/p>\n\n\n<div class=\"wp-block-image\">\n<figure class=\"aligncenter size-large is-resized\"><div class='fancybox-wrapper lazyload-container-unload' data-fancybox='post-images' 
href='https:\/\/www.qibinliang.com\/wp-content\/uploads\/2024\/10\/image-4-1024x582.png'><img class=\"lazyload lazyload-style-1\" src=\"data:image\/svg+xml;base64,PCEtLUFyZ29uTG9hZGluZy0tPgo8c3ZnIHdpZHRoPSIxIiBoZWlnaHQ9IjEiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyIgc3Ryb2tlPSIjZmZmZmZmMDAiPjxnPjwvZz4KPC9zdmc+\"  loading=\"lazy\" decoding=\"async\" width=\"1024\" height=\"582\" data-original=\"https:\/\/www.qibinliang.com\/wp-content\/uploads\/2024\/10\/image-4-1024x582.png\" src=\"data:image\/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAADsQAAA7EAZUrDhsAAAANSURBVBhXYzh8+PB\/AAffA0nNPuCLAAAAAElFTkSuQmCC\" alt=\"\" class=\"wp-image-330\" style=\"width:667px;height:auto\"  sizes=\"auto, (max-width: 1024px) 100vw, 1024px\" \/><\/div><figcaption class=\"wp-element-caption\">\u56fe2. Stencil\u64cd\u4f5c<\/figcaption><\/figure>\n<\/div>\n\n\n<h3 class=\"wp-block-heading\">\u7ebf\u7a0b\u540c\u6b65\u64cd\u4f5c\uff08__syncthreads\uff09<\/h3>\n\n\n\n<p>\u5728\u4e0a\u9762\u7684\u4f8b\u5b50\u4e2d\u5f80\u5171\u4eab\u5185\u5b58\u4e2d\u586b\u5165\u6570\u636e\u8fd9\u4e00\u6b65\u64cd\u4f5c\u662f\u5e76\u884c\u7684\uff0c\u8fd9\u4f1a\u9020\u6210\u4e00\u4e2a\u95ee\u9898\uff0c\u5f53block\u5185\u7b2ci\u4e2a\u7ebf\u7a0b\u9700\u8981\u505astencil\u64cd\u4f5c\u7684\u65f6\u5019\u4ed6\u9700\u8981\u8bfb\u53d6\u5171\u4eab\u5185\u5b58\u4e2d[i-3, 
i+3]\u51717\u4e2a\u5143\u7d20\uff0c\u4f46\u662f\u6211\u4eec\u65e0\u6cd5\u4fdd\u8bc1\u5171\u4eab\u5185\u5b58\u4e2d\u5176\u4ed6\u5143\u7d20\u5df2\u7ecf\u88ab\u586b\u5165\uff0c\u56e0\u6b64\u6211\u4eec\u5728\u8fd9\u91cc\u8981\u4f7f\u7528\u4e00\u4e2a\u540c\u6b65\u64cd\u4f5c\uff0c\u4fdd\u8bc1\u5171\u4eab\u5185\u5b58\u7684\u6570\u7ec4\u586b\u5165\u64cd\u4f5c\u5df2\u7ecf\u88ab\u5b8c\u6210\u3002<\/p>\n\n\n\n<p>\u8fd9\u91cc\u6211\u4eec\u53ef\u4ee5\u4f7f\u7528<code>__syncthreads()<\/code>\u51fd\u6570\uff0c\u8fd9\u4e2a\u51fd\u6570\u4f1a\u540c\u6b65\u5757\u5185\u7684\u6240\u6709\u7ebf\u7a0b\uff0c\u5f53\u4e14\u4ec5\u5f53\u6240\u6709\u7ebf\u7a0b\u6267\u884c\u5230\u8fd9\u4e2a\u5c4f\u969c\uff08barrier\uff09\uff0c\u7ebf\u7a0b\u624d\u4f1a\u88ab\u5141\u8bb8\u6267\u884c\u540e\u7eed\u7684\u64cd\u4f5c\u3002<\/p>\n\n\n<div class=\"wp-block-image\">\n<figure class=\"aligncenter size-large is-resized\"><div class='fancybox-wrapper lazyload-container-unload' data-fancybox='post-images' href='https:\/\/www.qibinliang.com\/wp-content\/uploads\/2024\/10\/image-5-1024x572.png'><img class=\"lazyload lazyload-style-1\" src=\"data:image\/svg+xml;base64,PCEtLUFyZ29uTG9hZGluZy0tPgo8c3ZnIHdpZHRoPSIxIiBoZWlnaHQ9IjEiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyIgc3Ryb2tlPSIjZmZmZmZmMDAiPjxnPjwvZz4KPC9zdmc+\"  loading=\"lazy\" decoding=\"async\" width=\"1024\" height=\"572\" data-original=\"https:\/\/www.qibinliang.com\/wp-content\/uploads\/2024\/10\/image-5-1024x572.png\" src=\"data:image\/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAADsQAAA7EAZUrDhsAAAANSURBVBhXYzh8+PB\/AAffA0nNPuCLAAAAAElFTkSuQmCC\" alt=\"\" class=\"wp-image-333\" style=\"width:668px;height:auto\"  sizes=\"auto, (max-width: 1024px) 100vw, 1024px\" \/><\/div><figcaption class=\"wp-element-caption\">\u56fe3. 
\u540c\u6b65\u64cd\u4f5c<\/figcaption><\/figure>\n<\/div>\n\n\n<h3 class=\"wp-block-heading\">\u534f\u4f5c\u7ec4\uff08Cooperative Groups\uff09<\/h3>\n\n\n\n<p>Tom\u5728\u8fd9\u8282\u8bfe\u4e2d\u4e5f\u8bb2\u5230\u5b9e\u9645CUDA 9\u4e2d\u4e5f\u6dfb\u52a0\u4e86\u4e0d\u5c11\u7684\u7279\u6027\uff0c\u4f8b\u5982\u534f\u4f5c\u7ec4\uff0c\u8fd9\u53ef\u4ee5\u8ba9\u6211\u4eec\u66f4\u52a0\u7075\u6d3b\u5730\u53bb\u540c\u6b65\u7ebf\u7a0b\u6216\u8005\u5efa\u7acb\u7ebf\u7a0b\u95f4\u7684\u901a\u4fe1\uff0c\u503c\u5f97\u6ce8\u610f\u7684\u662f\u8fd9\u91cc\u7684\u7ebf\u7a0b\u4e0d\u518d\u662f\u4e00\u4e2a\u7ebf\u7a0b\u5757\uff08block\uff09\u5185\u7684\u7ebf\u7a0b\u800c\u662f\u53ef\u4ee5\u8de8\u7ebf\u7a0b\u5757\uff0c\u540c\u65f6\uff0c\u4e5f\u53ef\u4ee5\u662f\u7ebf\u7a0b\u5757\u5185\u7684\u67d0\u4e9b\u7ebf\u7a0b\u800c\u5e76\u975e\u6240\u6709\u7ebf\u7a0b\u3002\u8fd9\u6837\u7684\u8bbe\u8ba1\u5229\u7528\u4e0d\u540c\u9897\u7c92\u5ea6\u7684\u7ebf\u7a0b\u7ec4\u7ed9\u7a0b\u5e8f\u5458\u63d0\u4f9b\u4e86\u66f4\u52a0\u7075\u6d3b\u3001\u52a8\u6001\u7684\u7ebf\u7a0b\u5206\u7ec4\u3002<\/p>\n\n\n\n<p>\u4f46\u662fTom\u8868\u793a\uff0c\u534f\u4f5c\u7ec4\u4f5c\u4e3a\u66f4\u9ad8\u9636\u7684\u5185\u5bb9\u4f1a\u5728\u7a0d\u665a\u7684\u8bfe\u7a0b\u4e2d\u8bb2\u89e3\uff0c\u8fd9\u91cc\u4ec5\u4ec5\u4e00\u7b14\u5e26\u8fc7\u3002\u6211Google\u4e86\u4e00\u4e0b\u76f8\u5173\u8d44\u6599\uff0c\u5b98\u65b9\u7ed9\u51fa\u7684<a href=\"https:\/\/developer.nvidia.com\/blog\/cooperative-groups\/\"><strong><em><mark style=\"background-color:rgba(0, 0, 0, 0)\" class=\"has-inline-color has-vivid-cyan-blue-color\">\u6280\u672f\u535a\u5ba2<\/mark><\/em><\/strong><\/a>\u8bb2\u7684\u5341\u5206\u8be6\u7ec6\uff0c\u6709\u5174\u8da3\u53ef\u4ee5\u770b\u4e00\u4e0b\u3002<\/p>\n\n\n<div class=\"wp-block-image\">\n<figure class=\"aligncenter size-large is-resized\"><div class='fancybox-wrapper lazyload-container-unload' data-fancybox='post-images' href='https:\/\/www.qibinliang.com\/wp-content\/uploads\/2024\/10\/image-6-1024x557.png'><img 
class=\"lazyload lazyload-style-1\" src=\"data:image\/svg+xml;base64,PCEtLUFyZ29uTG9hZGluZy0tPgo8c3ZnIHdpZHRoPSIxIiBoZWlnaHQ9IjEiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyIgc3Ryb2tlPSIjZmZmZmZmMDAiPjxnPjwvZz4KPC9zdmc+\"  loading=\"lazy\" decoding=\"async\" width=\"1024\" height=\"557\" data-original=\"https:\/\/www.qibinliang.com\/wp-content\/uploads\/2024\/10\/image-6-1024x557.png\" src=\"data:image\/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAADsQAAA7EAZUrDhsAAAANSURBVBhXYzh8+PB\/AAffA0nNPuCLAAAAAElFTkSuQmCC\" alt=\"\" class=\"wp-image-336\" style=\"width:643px;height:auto\"  sizes=\"auto, (max-width: 1024px) 100vw, 1024px\" \/><\/div><figcaption class=\"wp-element-caption\">\u56fe4. \u534f\u4f5c\u7ec4<\/figcaption><\/figure>\n<\/div>\n\n\n<h3 class=\"wp-block-heading\">\u5171\u4eab\u5185\u5b58\u4e0e\u7f13\u5b58\uff08Shared Memory vs Cache\uff09<\/h3>\n\n\n\n<p>\u5171\u4eab\u5185\u5b58\uff08Shared Memory\uff09\u662f\u7531\u7528\u6237\u7ba1\u7406\u7684\u9ad8\u901f\u7f13\u5b58\uff0c\u800c\u7f13\u5b58\uff08Cache\uff09\u5219\u662f\u9690\u5f0f\u5730\u7ba1\u7406\u6570\u636e\u3002\u7f13\u5b58\u9075\u5faa\u7740\u67d0\u4e9b\u6a21\u5f0f\u53bb\u7ba1\u7406\u672a\u6765\u53ef\u80fd\u4f1a\u7528\u5230\u7684\u6570\u636e\uff0c\u4e0eCPU\u7f13\u5b58\u7c7b\u4f3c\uff0cGPU\u7f13\u5b58\u4e5f\u6709\u81ea\u5df1\u7684\u8c03\u5ea6\u7b97\u6cd5\u3002\u5728\u65e9\u4e9b\u5e74\u7684\u65f6\u5019\uff0cGPU\u662f\u6ca1\u6709\u7f13\u5b58\u7684\uff0c\u7528\u6237\u53ea\u80fd\u663e\u5f0f\u5730\u7ba1\u7406\u81ea\u5df1\u7684\u6570\u636e\u7f13\u5b58\uff0c\u56e0\u6b64\u7a0b\u5e8f\u7684\u6548\u7387\u4f1a\u56e0\u4e3a\u7a0b\u5e8f\u5458\u5bf9\u7b97\u6cd5\u7684\u7406\u89e3\u800c\u4ea7\u751f\u5de8\u5927\u7684\u5dee\u5f02\u3002<\/p>\n\n\n\n<p>\u968f\u7740\u65f6\u4ee3\u7684\u53d1\u5c55\uff0cNVIDIA\u7684GPU\u6dfb\u52a0\u4e86\u5927\u5c0f\u53ef\u89c2\u7684L1\u3001L2\u7f13\u5b58\uff0c\u4f8b\u5982Volta\u67b6\u6784\u7684GPU\u3002\u5728\u5982\u6b64\u5145\u6c
9b\u7684\u7f13\u5b58\u7a7a\u95f4\u7684\u652f\u6301\u4e0b\uff0c\u6765\u81ea\u4e8e\u7cbe\u5999\u7ba1\u7406\u7684\u5171\u4eab\u5185\u5b58\u6240\u5e26\u6765\u7684\u4f18\u52bf\u5c31\u4f1a\u88ab\u6781\u5927\u5730\u7f29\u5c0f\u3002\u56fe5\u5c55\u793a\u4e86Volta\u67b6\u6784\u4e0ePascal\u67b6\u6784\u4e0b\u5171\u4eab\u5185\u5b58\u6240\u5e26\u6765\u7684\u6027\u80fd\u63d0\u5347\u5bf9\u6bd4\uff0c\u76f8\u6bd4Pascal\uff0cVolta\u62e5\u6709\u66f4\u5927\u7f13\u5b58\uff0c\u56e0\u6b64\u5171\u4eab\u5185\u5b58\u6240\u5e26\u6765\u7684\u4f18\u52bf\u663e\u8457\u51cf\u5c11\u3002<\/p>\n\n\n\n<p>\u6b63\u5982\u5b98\u65b9\u8bba\u575b\u7684\u4e00\u7bc7<a href=\"https:\/\/forums.developer.nvidia.com\/t\/is-it-possible-to-use-l1-cache-instead-of-shared-memory-when-implementing-blocked-matmuls-in-cuda\/256985\/3\"><strong><em><mark style=\"background-color:rgba(0, 0, 0, 0)\" class=\"has-inline-color has-vivid-cyan-blue-color\">\u8ba8\u8bba\u5e16\u5b50<\/mark><\/em><\/strong><\/a>\u4e2d\uff0c\u4e00\u4f4d\u53eb\u505anjuffa\u7684\u8001\u54e5\u8bf4\u7684\uff1a<\/p>\n\n\n\n<blockquote class=\"wp-block-quote is-layout-flow wp-block-quote-is-layout-flow\">\n<p>I think it is fair to say that the importance of shared memory in CUDA programming has decreased with the advent of L1\/L2 caches of competitive size in GPUs<br>\u6211\u8ba4\u4e3a\u53ef\u4ee5\u5f88\u516c\u5e73\u5730\u8bb2\uff0c\u5171\u4eab\u5185\u5b58\u5728CUDA\u7f16\u7a0b\u4e2d\u7684\u5730\u4f4d\u968f\u7740\u5927\u5c0f\u5177\u6709\u7ade\u4e89\u529b\u7684L1\/L2\u7f13\u5b58\u7684\u6765\u4e34\uff0c\u5df2\u7ecf\u5728\u964d\u4f4e\u3002<\/p>\n<\/blockquote>\n\n\n<div class=\"wp-block-image\">\n<figure class=\"aligncenter size-large is-resized\"><div class='fancybox-wrapper lazyload-container-unload' data-fancybox='post-images' href='https:\/\/www.qibinliang.com\/wp-content\/uploads\/2024\/10\/image-7-1024x572.png'><img class=\"lazyload lazyload-style-1\" 
src=\"data:image\/svg+xml;base64,PCEtLUFyZ29uTG9hZGluZy0tPgo8c3ZnIHdpZHRoPSIxIiBoZWlnaHQ9IjEiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyIgc3Ryb2tlPSIjZmZmZmZmMDAiPjxnPjwvZz4KPC9zdmc+\"  loading=\"lazy\" decoding=\"async\" width=\"1024\" height=\"572\" data-original=\"https:\/\/www.qibinliang.com\/wp-content\/uploads\/2024\/10\/image-7-1024x572.png\" src=\"data:image\/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAADsQAAA7EAZUrDhsAAAANSURBVBhXYzh8+PB\/AAffA0nNPuCLAAAAAElFTkSuQmCC\" alt=\"\" class=\"wp-image-338\" style=\"width:653px;height:auto\"  sizes=\"auto, (max-width: 1024px) 100vw, 1024px\" \/><\/div><figcaption class=\"wp-element-caption\">\u56fe5. \u4e0d\u540c\u67b6\u6784\u4e0b\u5171\u4eab\u5185\u5b58\u6027\u80fd\u63d0\u5347\u5bf9\u6bd4<\/figcaption><\/figure>\n<\/div>\n\n\n<h2 class=\"wp-block-heading\">\u8bfe\u7a0b\u7ec3\u4e60<\/h2>\n\n\n\n<details class=\"wp-block-details is-layout-flow wp-block-details-is-layout-flow\"><summary>hw2 &#8211; stencil1d<\/summary>\n<p>\u5b8c\u5584stencil1d\u7684\u4ee3\u7801\uff0c\u4ee3\u7801\u76f8\u5bf9\u7b80\u5355\u76f4\u767d\uff0c\u548c\u8bfe\u7a0b\u4e2d\u7684\u5185\u5bb9\u9ad8\u5ea6\u4e00\u81f4\uff0c\u6240\u4ee5\u5e76\u6ca1\u6709\u989d\u5916\u7684\u6536\u83b7\u3002<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>__global__ void stencil_1d(int *in, int *out) {\n    __shared__ int temp&#91;BLOCK_SIZE + 2*RADIUS];\n    int gindex = threadIdx.x + blockIdx.x * blockDim.x;\n    int lindex = <mark style=\"background-color:rgba(0, 0, 0, 0)\" class=\"has-inline-color has-luminous-vivid-orange-color\">threadIdx.x + RADIUS<\/mark>;\n\n    \/\/ Read input elements into shared memory\n    temp&#91;lindex] = in&#91;gindex];\n    if (threadIdx.x &lt; RADIUS) {\n      temp&#91;lindex - RADIUS] = in&#91;gindex - RADIUS];\n      temp&#91;lindex + BLOCK_SIZE] = in&#91;gindex + BLOCK_SIZE];\n    }\n\n    \/\/ Synchronize (ensure all the data is available)\n    __syncthreads();\n\n  
  \/\/ Apply the stencil\n    int result = 0;\n    for (int offset = -RADIUS; offset &lt;= RADIUS; offset++)\n      result += temp&#91;<mark style=\"background-color:rgba(0, 0, 0, 0)\" class=\"has-inline-color has-luminous-vivid-orange-color\">lindex + offset<\/mark>];\n\n    \/\/ Store the result\n    out&#91;gindex] = result;\n}\n<\/code><\/pre>\n<\/details>\n\n\n\n<details class=\"wp-block-details is-layout-flow wp-block-details-is-layout-flow\"><summary>hw2 &#8211; matrix_mul_shared<\/summary>\n<p>\u8fd9\u4e2a\u5176\u5b9e\u4e0e\u7b2c\u4e00\u7ae0\u8282\u4e2dhw1 &#8211; matrix_mul\u7684\u4ee3\u7801\u5dee\u8ddd\u4e0d\u5927\uff0c\u5173\u952e\u5728\u4e8e\u8fd9\u6b21\u7ec3\u4e60\u4f7f\u7528\u4e86\u5171\u4eab\u5185\u5b58\uff0c\u5c06\u5185\u79ef\u9700\u8981\u7684\u884c\u5217\u62c6\u6210\u82e5\u5e72\u4e2a\u7ebf\u7a0b\u5757\uff0c\u7136\u540e\u6bcf\u4e2a\u7ebf\u7a0b\u5757\u8bb0\u5f55\u4e00\u4efd\u5171\u4eab\u5185\u5b58\u3002\u5f00\u59cb\u5199\u7684\u65f6\u5019\u591a\u5c11\u6709\u70b9\u4e0d\u4e60\u60ef\uff0c\u56e0\u4e3a\u5bf9\u4e8e\u8f93\u51fa\u77e9\u9635\u7684\u6bcf\u4e2a\u5143\u7d20\u6765\u8bf4\uff0c\u90fd\u8981\u7ba1\u7406As\u3001Bs\u4e24\u5757\u5171\u4eab\u5185\u5b58\uff0c\u4f46\u662f\u8fd9\u4e24\u5757\u5171\u4eab\u5185\u5b58\u4e2d\u53ea\u6709As\u7684\u7b2cthreadIdx.y\u884c\u4e0eBs\u7684\u7b2cthreadIdx.x\u5217\u4f1a\u88ab\u4f7f\u7528\uff0c\u5c31\u8ba9\u6211\u89c9\u5f97\u8fd9\u4e2a\u5171\u4eab\u5185\u5b58\u5b58\u50a8\u4e86\u5f88\u591a\u65e0\u7528\u7684\u6570\u636e\u3002<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>__global__ void mmul(const float *A, const float *B, float *C, int ds) {\n\n  \/\/ declare cache in shared memory\n  __shared__ float As&#91;block_size]&#91;block_size];\n  __shared__ float Bs&#91;block_size]&#91;block_size];\n\n  int idx = threadIdx.x+blockDim.x*blockIdx.x; \/\/ create thread x index\n  int idy = threadIdx.y+blockDim.y*blockIdx.y; \/\/ create thread y index\n\n  if ((idx &lt; ds) &amp;&amp; (idy &lt; ds)){\n    float temp = 0;\n    for 
(int i = 0; i &lt; ds\/block_size; i++) {\n\n      \/\/ Load data into shared memory\n     <mark style=\"background-color:rgba(0, 0, 0, 0)\" class=\"has-inline-color has-luminous-vivid-orange-color\"> As&#91;threadIdx.y]&#91;threadIdx.x] = A&#91;idy * ds + (i * block_size + threadIdx.x)];\n      Bs&#91;threadIdx.y]&#91;threadIdx.x] = B&#91;(i * block_size + threadIdx.y) * ds + idx];<\/mark>\n\n      \/\/ Synchronize\n      __syncthreads();\n\n      \/\/ Keep track of the running sum\n      for (int k = 0; k &lt; block_size; k++)\n      \t<mark style=\"background-color:rgba(0, 0, 0, 0)\" class=\"has-inline-color has-luminous-vivid-orange-color\">temp += As&#91;threadIdx.y]&#91;k] * Bs&#91;k]&#91;threadIdx.x]; \/\/ dot product of row and column<\/mark>\n      __syncthreads();\n\n    }\n\n    \/\/ Write to global memory\n    C&#91;idy*ds+idx] = temp;\n  }\n}<\/code><\/pre>\n\n\n\n<p>\u6700\u540e\u6211\u4eec\u6765\u5bf9\u6bd4\u4e0b\u4f7f\u7528\u5171\u4eab\u5185\u5b58\u4e0e\u5168\u5c40\u5185\u5b58\u7684\u4ee3\u7801\u6267\u884c\u6548\u7387\u7684\u533a\u522b\uff1a<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code><strong>\u5168\u5c40\u5185\u5b58<\/strong>\n(base) qibin:~\/projects\/cuda-training-series\/exercises\/hw1$ .\/matrix_mul \nInit took 0.161087 seconds.  Begin compute\nDone. Compute took <mark style=\"background-color:rgba(0, 0, 0, 0)\" class=\"has-inline-color has-vivid-red-color\">4.861984<\/mark> seconds\nSuccess!\n\n<strong>\u5171\u4eab\u5185\u5b58<\/strong>\n(base) qibin:~\/projects\/cuda-training-series\/exercises\/hw2$ .\/matrix_mul_shared \nInit took 0.157803 seconds.  Begin compute\nDone. 
Compute took <mark style=\"background-color:rgba(0, 0, 0, 0)\" class=\"has-inline-color has-vivid-red-color\">1.450261<\/mark> seconds\nSuccess!<\/code><\/pre>\n\n\n\n<p>\u4ece\u7ed3\u679c\u4e0a\u6765\u770b\uff0c\u4f7f\u7528\u5171\u4eab\u5185\u5b58\u540e\u901f\u5ea6\u5927\u6982\u662f\u4f7f\u7528\u5168\u5c40\u5185\u5b58\u65f6\u76843.4\u500d\uff08\u4e5f\u5c31\u662f\u8db3\u8db32.4\u500d\u901f\u5ea6\u7684\u63d0\u5347\uff09\uff0c\u8ba1\u7b97\u65f6\u95f4\u51cf\u5c11\u4e86\u5dee\u4e0d\u591a70%\uff0c\u76f8\u5f53\u7684\u4e0d\u9519\u3002<\/p>\n<\/details>\n\n\n\n<p><\/p>\n","protected":false},"excerpt":{"rendered":"<p>\u8bfe\u7a0b\u5185\u5bb9\uff08CUDA Shared Memory\uff09 \u6b64\u6b21\u8bb2\u5ea7\u662fOLCF CUDA\u57f9\u8bad\u7cfb\u5217\u7684\u7b2c\u4e8c\u90e8\u5206\uff0c\u91cd\u70b9\u8bb2\u89e3\u4e86\u5982 [&hellip;]<\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[35],"tags":[33,27,31,29],"class_list":["post-321","post","type-post","status-publish","format-standard","hentry","category-35","tag-c","tag-cuda","tag-hip","tag-29"],"_links":{"self":[{"href":"https:\/\/www.qibinliang.com\/index.php\/wp-json\/wp\/v2\/posts\/321","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/www.qibinliang.com\/index.php\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/www.qibinliang.com\/index.php\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/www.qibinliang.com\/index.php\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/www.qibinliang.com\/index.php\/wp-json\/wp\/v2\/comments?post=321"}],"version-history":[{"count":20,"href":"https:\/\/www.qibinliang.com\/index.php\/wp-json\/wp\/v2\/posts\/321\/revisions"}],"predecessor-version":[{"id":346,"href":"https:\/\/www.qibinliang.com\/index.php\/wp-json\/wp\/v2\/posts\/321\/revisions\/346"}],"wp:attachment":[{"href":"https:\/\/www.qibinliang.com\/index.php\/wp-json\/wp\/v2\/media?parent=321"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/www.qibinliang.com\/index.php\/wp-jso
n\/wp\/v2\/categories?post=321"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/www.qibinliang.com\/index.php\/wp-json\/wp\/v2\/tags?post=321"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}