{"id":16556,"date":"2024-08-06T15:11:48","date_gmt":"2024-08-06T07:11:48","guid":{"rendered":"http:\/\/139.9.1.231\/?p=16556"},"modified":"2024-10-22T14:02:32","modified_gmt":"2024-10-22T06:02:32","slug":"ddp-dataloader","status":"publish","type":"post","link":"http:\/\/139.9.1.231\/index.php\/2024\/08\/06\/ddp-dataloader\/","title":{"rendered":"DDP\u5206\u5e03\u5f0f\u8bad\u7ec3&#8211;\u6570\u636e\u52a0\u8f7d\u548c\u8bad\u7ec3NCCL"},"content":{"rendered":"\n<p>        \u6df1\u5ea6\u5b66\u4e60\u7684\u53d1\u5c55\u8bc1\u660e\u4e86\u5927\u6570\u636e\u548c\u5927\u6a21\u578b\u7684\u4ef7\u503c\u3002\u65e0\u8bba\u662f\u5728CV\u8fd8\u662fNLP\u9886\u57df\uff0c\u5728\u5927\u89c4\u6a21\u7684\u8ba1\u7b97\u8d44\u6e90\u4e0a\u8bad\u7ec3\u6a21\u578b\u7684\u80fd\u529b\u53d8\u5f97\u65e5\u76ca\u91cd\u8981\u3002GPU\u4ee5\u6bd4CPU\u66f4\u5feb\u7684\u77e9\u9635\u4e58\u6cd5\u548c\u52a0\u6cd5\u8fd0\u7b97\uff0c\u52a0\u901f\u4e86\u6a21\u578b\u8bad\u7ec3\u3002\u4f46\u968f\u7740\u6570\u636e\u91cf\u548c\u6a21\u578b\u53c2\u6570\u7684\u589e\u957f\uff0c\u5355\u5757GPU\u5f88\u5feb\u53d8\u5f97\u4e0d\u591f\u7528\u3002\u56e0\u6b64\u6211\u4eec\u5fc5\u987b\u627e\u5230\u5408\u9002\u7684\u65b9\u6cd5\uff0c\u5b9e\u73b0\u6570\u636e\u548c\u6a21\u578b\u5728\u591a\u4e2aGPU\u751a\u81f3\u591a\u4e2a\u8ba1\u7b97\u8282\u70b9\u95f4\u7684\u5212\u5206\u548c\u590d\u5236\uff0c\u4ece\u800c\u5b9e\u73b0\u66f4\u77ed\u7684\u8bad\u7ec3\u5468\u671f\u548c\u66f4\u5927\u7684\u6a21\u578b\u53c2\u6570\u91cf\u3002<\/p>\n\n\n\n\n\n<p id=\"ZH-CN_TOPIC_0000001402314412__p11162773315\">DDP\u5927\u81f4\u7684\u6d41\u7a0b\u5982\u4e0b\uff1a<\/p>\n\n\n\n<ol id=\"ZH-CN_TOPIC_0000001402314412__ol1447612113112\"><li>\u521d\u59cb\u5316\u8fdb\u7a0b\u7ec4\u3002<\/li><li>\u521b\u5efa\u5206\u5e03\u5f0f\u5e76\u884c\u6a21\u578b\uff0c\u6bcf\u4e2a\u8fdb\u7a0b\u90fd\u4f1a\u6709\u76f8\u540c\u7684\u6a21\u578b\u548c\u53c2\u6570\u3002<\/li><li><strong>\u521b\u5efa\u6570\u636e\u5206\u53d1Sampler\uff0c\u4f7f\u6bcf\u4e2a\u8fdb\u7a0b\u52a0\u8f7d\u4e00\u4e2amini 
batch\u4e2d\u4e0d\u540c\u90e8\u5206\u7684\u6570\u636e\u3002<\/strong><\/li><li>\u7f51\u7edc\u4e2d\u76f8\u90bb\u53c2\u6570\u5206\u6876\uff0c\u4e00\u822c\u4e3a\u795e\u7ecf\u7f51\u7edc\u6a21\u578b\u4e2d\u9700\u8981\u8fdb\u884c\u53c2\u6570\u66f4\u65b0\u7684\u6bcf\u4e00\u5c42\u7f51\u7edc\u3002<\/li><li>\u6bcf\u4e2a\u8fdb\u7a0b\u524d\u5411\u4f20\u64ad\u5e76\u5404\u81ea\u8ba1\u7b97\u68af\u5ea6\u3002<\/li><li>\u6a21\u578b\u67d0\u4e00\u5c42\u7684\u53c2\u6570\u5f97\u5230\u68af\u5ea6\u540e\u4f1a\u9a6c\u4e0a\u8fdb\u884c\u901a\u8baf\u5e76\u8fdb\u884c\u68af\u5ea6\u5e73\u5747\u3002<\/li><li>\u5404GPU\u66f4\u65b0\u6a21\u578b\u53c2\u6570\u3002<\/li><\/ol>\n\n\n\n<figure class=\"wp-block-image\"><img src=\"https:\/\/support.huaweicloud.com\/intl\/zh-cn\/develop-modelarts\/figure\/zh-cn_image_0000001401994692.png\" alt=\"\"\/><\/figure>\n\n\n\n<p class=\"has-light-pink-background-color has-background\"><strong>\u4eca\u5929\u4e3b\u8981\u6765\u7814\u7a76 3\u521b\u5efa\u6570\u636e\u5206\u53d1\u548cSampler \uff1a\u4e3b\u8981\u7531\u4e09\u90e8\u5206\u7ec4\u6210\uff1atorch.utils.data.Dataset\u3010\u53ef\u4ee5\u81ea\u5b9a\u4e49\u3011\u3001torch.utils.data.DataLoader\u3001\u4ee5\u53catorch.utils.data.distributed.DistributedSampler\u3010\u53ef\u4ee5\u81ea\u5df1\u5b9a\u4e49\u3011\u3002<\/strong><\/p>\n\n\n\n<p><code>DistributedSampler<\/code> \u786e\u4fdd\u6bcf\u4e2a\u8fdb\u7a0b\uff08\u6216 GPU\uff09\u5904\u7406\u6570\u636e\u96c6\u7684\u4e0d\u540c\u90e8\u5206\u3002<code>DataLoader<\/code> \u4f7f\u7528 <code>DistributedSampler<\/code> \u751f\u6210\u7684\u6570\u636e\u7d22\u5f15\u6765\u5206\u6279\u6570\u636e\uff0c\u5e76\u8fdb\u884c\u6570\u636e\u52a0\u8f7d\u548c\u9884\u5904\u7406\u3002<\/p>\n\n\n\n<h3>1\u3001 Dataset \uff1a<\/h3>\n\n\n\n<p><code>Dataset<\/code> 
\u662f\u4e00\u4e2a\u62bd\u8c61\u7c7b\uff0c\u7528\u4e8e\u8868\u793a\u6570\u636e\u96c6\u3002\u4f60\u9700\u8981\u7ee7\u627f\u8fd9\u4e2a\u7c7b\u5e76\u5b9e\u73b0\u5176\u65b9\u6cd5\uff0c\u4ee5\u5b9a\u4e49\u4f60\u81ea\u5df1\u7684\u6570\u636e\u96c6\u3002\u5b83\u7684\u4e3b\u8981\u529f\u80fd\u5305\u62ec\uff1a<\/p>\n\n\n\n<ul><li><strong>\u5b9a\u4e49\u6570\u636e\u8bbf\u95ee<\/strong>\uff1a\u901a\u8fc7\u5b9e\u73b0 <code>__getitem__<\/code> \u65b9\u6cd5\uff0c\u5b9a\u4e49\u5982\u4f55\u8bbf\u95ee\u6570\u636e\u96c6\u4e2d\u5355\u4e2a\u6570\u636e\u9879\u3002<\/li><li><strong>\u6570\u636e\u96c6\u5927\u5c0f<\/strong>\uff1a\u901a\u8fc7\u5b9e\u73b0 <code>__len__<\/code> \u65b9\u6cd5\uff0c\u8fd4\u56de\u6570\u636e\u96c6\u4e2d\u6837\u672c\u7684\u603b\u6570\u3002<\/li><\/ul>\n\n\n\n<pre class=\"wp-block-code\"><code>class MyDataset(torch.utils.data.Dataset):\n    def __init__(self, data):\n        self.data = data\n\n    def __len__(self):\n        return len(self.data)\n\n    def __getitem__(self, index):\n        return self.data&#091;index]\n<\/code><\/pre>\n\n\n\n<h3>2\u3001DataLoader\uff1a<\/h3>\n\n\n\n<p><code>DataLoader<\/code> \u662f\u4e00\u4e2a\u6570\u636e\u52a0\u8f7d\u5668\uff0c\u5b83\u8d1f\u8d23\u4ece <code>Dataset<\/code> \u4e2d\u6279\u91cf\u52a0\u8f7d\u6570\u636e\u3002\u5b83\u63d0\u4f9b\u4e86\u5bf9\u6570\u636e\u7684\u6279\u91cf\u5904\u7406\u3001\u968f\u673a\u6253\u4e71\u3001\u5e76\u884c\u52a0\u8f7d\u7b49\u529f\u80fd\u3002<code>DataLoader<\/code> 
\u4e3b\u8981\u529f\u80fd\u5305\u62ec\uff1a<\/p>\n\n\n\n<ul><li><strong>\u6279\u91cf\u52a0\u8f7d<\/strong>\uff1a\u5c06\u6570\u636e\u96c6\u5206\u6210\u591a\u4e2a\u6279\u6b21\uff0c\u5e76\u5728\u6bcf\u6b21\u8fed\u4ee3\u4e2d\u8fd4\u56de\u4e00\u4e2a\u6279\u6b21\u7684\u6570\u636e\u3002<\/li><li><strong>\u5e76\u884c\u5904\u7406<\/strong>\uff1a\u4f7f\u7528\u591a\u4e2a\u5de5\u4f5c\u7ebf\u7a0b\uff08<code>num_workers<\/code>\uff09\u6765\u5e76\u884c\u52a0\u8f7d\u6570\u636e\uff0c\u63d0\u9ad8\u6570\u636e\u52a0\u8f7d\u901f\u5ea6\u3002<\/li><li><strong>\u6570\u636e\u6253\u4e71<\/strong>\uff1a\u901a\u8fc7 <code>shuffle<\/code> \u53c2\u6570\u6765\u968f\u673a\u6253\u4e71\u6570\u636e\u987a\u5e8f\u3002<\/li><li><strong>\u81ea\u52a8\u5904\u7406\u6837\u672c<\/strong>\uff1a\u4f7f\u7528 <code>collate_fn<\/code> \u5c06\u5355\u4e2a\u6837\u672c\u7ec4\u5408\u6210\u6279\u6b21\u3002<\/li><\/ul>\n\n\n\n<p>1. <strong>\u6570\u636e\u52a0\u8f7d\u548c\u9884\u5904\u7406<\/strong><\/p>\n\n\n\n<p><code>DataLoader<\/code> \u8d1f\u8d23\u4ece\u6570\u636e\u96c6\uff08<code>Dataset<\/code>\uff09\u4e2d\u52a0\u8f7d\u6570\u636e\uff0c\u5e76\u8fdb\u884c\u5fc5\u8981\u7684\u9884\u5904\u7406\u64cd\u4f5c\u3002\u9884\u5904\u7406\u53ef\u80fd\u5305\u62ec\u6570\u636e\u589e\u5f3a\u3001\u5f52\u4e00\u5316\u7b49\u3002\u5b83\u901a\u8fc7\u591a\u7ebf\u7a0b\u6216\u591a\u8fdb\u7a0b\u7684\u65b9\u5f0f\u5e76\u884c\u52a0\u8f7d\u6570\u636e\uff0c\u51cf\u5c11\u4e86\u6570\u636e\u52a0\u8f7d\u65f6\u95f4\u3002<\/p>\n\n\n\n<ul><li><strong><code>num_workers<\/code><\/strong>\uff1a\u6307\u5b9a\u7528\u4e8e\u6570\u636e\u52a0\u8f7d\u7684\u5b50\u8fdb\u7a0b\u6570\uff0c\u5e2e\u52a9\u52a0\u5feb\u6570\u636e\u52a0\u8f7d\u901f\u5ea6\u3002<\/li><\/ul>\n\n\n\n<p>2. 
<strong>\u6570\u636e\u5206\u6279<\/strong><\/p>\n\n\n\n<p><code>DataLoader<\/code> \u5c06\u6570\u636e\u96c6\u5212\u5206\u4e3a\u591a\u4e2a\u6279\u6b21\uff08batches\uff09\uff0c\u4ee5\u4fbf\u4e8e\u6a21\u578b\u8fdb\u884c\u8bad\u7ec3\u548c\u8bc4\u4f30\u3002\u6279\u6b21\u7684\u5927\u5c0f\u53ef\u4ee5\u901a\u8fc7 <code>batch_size<\/code> \u53c2\u6570\u8fdb\u884c\u8bbe\u7f6e\u3002<\/p>\n\n\n\n<ul><li><strong><code>batch_size<\/code><\/strong>\uff1a\u6bcf\u4e2a\u6279\u6b21\u7684\u6570\u636e\u91cf\uff0c\u8fd9\u5bf9\u4e8e\u8bad\u7ec3\u8fc7\u7a0b\u4e2d\u6bcf\u6b21\u8fed\u4ee3\u7684\u6570\u636e\u91cf\u975e\u5e38\u91cd\u8981\u3002<\/li><\/ul>\n\n\n\n<p>3. <strong>\u5206\u5e03\u5f0f\u8bad\u7ec3\u4e2d\u7684\u6570\u636e\u5212\u5206<\/strong><\/p>\n\n\n\n<p>\u5728 DDP \u4e0b\uff0c<code>DataLoader<\/code> \u7ed3\u5408 <code>Sampler<\/code> \u6765\u786e\u4fdd\u6570\u636e\u5728\u5404\u4e2a\u8fdb\u7a0b\u4e4b\u95f4\u7684\u6b63\u786e\u5206\u914d\u3002<code>Sampler<\/code> \u63a7\u5236\u6bcf\u4e2a\u8fdb\u7a0b\uff08\u6216 GPU\uff09\u83b7\u5f97\u6570\u636e\u96c6\u7684\u54ea\u4e00\u90e8\u5206\u3002<\/p>\n\n\n\n<ul><li><strong><code>DistributedSampler<\/code><\/strong>\uff1a\u5f53\u8fdb\u884c\u5206\u5e03\u5f0f\u8bad\u7ec3\u65f6\uff0c<code>DistributedSampler<\/code> \u786e\u4fdd\u6bcf\u4e2a\u8fdb\u7a0b\u5904\u7406\u4e0d\u540c\u7684\u6570\u636e\u5b50\u96c6\uff0c\u4ece\u800c\u5b9e\u73b0\u8d1f\u8f7d\u5747\u8861\u548c\u907f\u514d\u6570\u636e\u91cd\u590d\u3002<\/li><\/ul>\n\n\n\n<p>4. 
<strong>\u6570\u636e\u7684\u6253\u4e71\u548c\u987a\u5e8f<\/strong><\/p>\n\n\n\n<p>\u4e3a\u4e86\u63d0\u9ad8\u6a21\u578b\u7684\u6cdb\u5316\u80fd\u529b\uff0c\u6570\u636e\u901a\u5e38\u5728\u6bcf\u4e2a epoch \u5f00\u59cb\u65f6\u88ab\u6253\u4e71\u3002<code>DataLoader<\/code> \u63d0\u4f9b\u4e86\u6253\u4e71\u6570\u636e\u7684\u529f\u80fd\uff0c\u8fd9\u5bf9\u4e8e\u8bad\u7ec3\u8fc7\u7a0b\u662f\u975e\u5e38\u91cd\u8981\u7684\u3002<\/p>\n\n\n\n<ul><li><strong><code>shuffle<\/code><\/strong>\uff1a\u6307\u5b9a\u662f\u5426\u5728\u6bcf\u4e2a epoch \u5f00\u59cb\u65f6\u6253\u4e71\u6570\u636e\uff0c\u8fd9\u6709\u52a9\u4e8e\u51cf\u5c11\u6a21\u578b\u5bf9\u6570\u636e\u987a\u5e8f\u7684\u8fc7\u62df\u5408\u3002<\/li><\/ul>\n\n\n\n<p>5. <strong>\u6279\u6b21\u4e22\u5f03<\/strong><\/p>\n\n\n\n<p>\u5728\u8bad\u7ec3\u8fc7\u7a0b\u4e2d\uff0c\u5982\u679c\u6700\u540e\u4e00\u4e2a\u6279\u6b21\u7684\u6837\u672c\u6570\u4e0d\u8db3\u4ee5\u6784\u6210\u5b8c\u6574\u7684\u6279\u6b21\uff0c\u53ef\u4ee5\u9009\u62e9\u4e22\u5f03\u8fd9\u4e2a\u6279\u6b21\uff0c\u4ee5\u4fdd\u8bc1\u6bcf\u4e2a\u6279\u6b21\u7684\u5927\u5c0f\u4e00\u81f4\u3002<\/p>\n\n\n\n<ul><li><strong><code>drop_last<\/code><\/strong>\uff1a\u6307\u5b9a\u662f\u5426\u4e22\u5f03\u6700\u540e\u4e00\u4e2a\u6279\u6b21\uff08\u5982\u679c\u5176\u5927\u5c0f\u5c0f\u4e8e <code>batch_size<\/code>\uff09\u3002<\/li><\/ul>\n\n\n\n<p>6. 
<strong>\u4e0e <code>Sampler<\/code> \u7ed3\u5408\u4f7f\u7528<\/strong><\/p>\n\n\n\n<p><code>DataLoader<\/code> \u53ef\u4ee5\u4e0e\u4e0d\u540c\u7684 <code>Sampler<\/code> \u7ed3\u5408\u4f7f\u7528\uff0c\u4ee5\u652f\u6301\u5404\u79cd\u6570\u636e\u52a0\u8f7d\u7b56\u7565\u3002\u5728 DDP \u4e0b\uff0c<code>DistributedSampler<\/code> \u662f\u5e38\u7528\u7684 <code>Sampler<\/code>\uff0c\u5b83\u5c06\u6570\u636e\u96c6\u5212\u5206\u4e3a\u591a\u4e2a\u5b50\u96c6\uff0c\u6bcf\u4e2a\u8fdb\u7a0b\u5904\u7406\u4e00\u4e2a\u5b50\u96c6\u3002<\/p>\n\n\n\n<ul><li><strong><code>batch_sampler<\/code><\/strong>\uff1a\u5982\u679c\u4f7f\u7528\u81ea\u5b9a\u4e49\u7684 <code>Sampler<\/code>\uff0c\u53ef\u4ee5\u5c06\u5176\u4f20\u9012\u7ed9 <code>batch_sampler<\/code> \u53c2\u6570\u6765\u63a7\u5236\u6570\u636e\u7684\u5206\u6279\u65b9\u5f0f\u3002<\/li><\/ul>\n\n\n\n<p>data = [1, 2, 3, 4, 5]<br>dataset = MyDataset(data)<br>dataloader = torch.utils.data.DataLoader(dataset, batch_size=2, shuffle=True, num_workers=2)<\/p>\n\n\n\n<p>for batch in dataloader:<br>print(batch)<\/p>\n\n\n\n<h3>3\u3001DistributedSampler\uff1a<\/h3>\n\n\n\n<p><code>DistributedSampler<\/code> \u7528\u4e8e\u5728\u5206\u5e03\u5f0f\u8bad\u7ec3\u4e2d\u5bf9\u6570\u636e\u8fdb\u884c\u91c7\u6837\u3002\u5b83\u7684\u4e3b\u8981\u4f5c\u7528\u662f\u786e\u4fdd\u6bcf\u4e2a\u8fdb\u7a0b\uff08\u6216 GPU\uff09\u5728\u5206\u5e03\u5f0f\u8bad\u7ec3\u4e2d\u83b7\u5f97\u6570\u636e\u7684\u4e0d\u540c\u5b50\u96c6\uff0c\u4ece\u800c\u907f\u514d\u6570\u636e\u91cd\u590d\u548c\u786e\u4fdd\u6570\u636e\u5747\u5300\u5206\u914d\u3002\u4e3b\u8981\u529f\u80fd\u5305\u62ec\uff1a<\/p>\n\n\n\n<ul><li><strong>\u5206\u5e03\u5f0f\u6570\u636e\u5206\u914d<\/strong>\uff1a\u6839\u636e\u8fdb\u7a0b\u7684 rank \u548c\u603b\u8fdb\u7a0b\u6570\uff0c\u8ba1\u7b97\u51fa\u6bcf\u4e2a\u8fdb\u7a0b\u5e94\u8be5\u5904\u7406\u7684\u6570\u636e\u5b50\u96c6\u3002<\/li><li><strong>\u968f\u673a\u6253\u4e71<\/strong>\uff1a\u652f\u6301\u5728\u6bcf\u4e2a epoch 
\u91cd\u65b0\u6253\u4e71\u6570\u636e\uff0c\u4ee5\u589e\u52a0\u8bad\u7ec3\u7684\u968f\u673a\u6027\u3002<\/li><li><strong>\u540c\u6b65<\/strong>\uff1a\u5728\u591a\u4e2a\u8fdb\u7a0b\u4e4b\u95f4\u534f\u8c03\u6570\u636e\u7684\u91c7\u6837\u3002<\/li><\/ul>\n\n\n\n<p>1. <strong>\u6570\u636e\u5206\u914d<\/strong><\/p>\n\n\n\n<p>\u5728\u5206\u5e03\u5f0f\u8bad\u7ec3\u4e2d\uff0c\u6570\u636e\u96c6\u88ab\u5212\u5206\u6210\u591a\u4e2a\u5b50\u96c6\uff0c\u6bcf\u4e2a\u8fdb\u7a0b\uff08\u6216 GPU\uff09\u5904\u7406\u6570\u636e\u96c6\u7684\u4e00\u90e8\u5206\u3002<code>Sampler<\/code> \u786e\u4fdd\u6bcf\u4e2a\u8fdb\u7a0b\uff08\u6216 GPU\uff09\u5f97\u5230\u4e0d\u540c\u7684\u6570\u636e\u5b50\u96c6\uff0c\u4ee5\u907f\u514d\u91cd\u590d\u548c\u6570\u636e\u4e22\u5931\u3002<\/p>\n\n\n\n<ul><li><strong>DistributedSampler<\/strong>\uff1a\u8fd9\u662f PyTorch \u63d0\u4f9b\u7684\u4e13\u95e8\u7528\u4e8e\u5206\u5e03\u5f0f\u8bad\u7ec3\u7684\u91c7\u6837\u5668\u3002\u5b83\u6839\u636e\u5f53\u524d\u8fdb\u7a0b\u7684 <code>rank<\/code> \u548c\u603b\u8fdb\u7a0b\u6570 <code>num_replicas<\/code> \u6765\u5212\u5206\u6570\u636e\u96c6\u3002\u6bcf\u4e2a\u8fdb\u7a0b\u83b7\u5f97\u6570\u636e\u96c6\u7684\u4e0d\u540c\u90e8\u5206\uff0c\u4ece\u800c\u5b9e\u73b0\u6570\u636e\u7684\u6709\u6548\u5206\u914d\u548c\u8d1f\u8f7d\u5747\u8861\u3002<\/li><\/ul>\n\n\n\n<p>2. 
<strong>\u786e\u4fdd\u6570\u636e\u8986\u76d6<\/strong><\/p>\n\n\n\n<p>\u5728\u6bcf\u4e2a epoch \u4e2d\uff0c\u6bcf\u4e2a\u8fdb\u7a0b\u9700\u8981\u83b7\u53d6\u6570\u636e\u96c6\u7684\u4e0d\u540c\u90e8\u5206\uff0c\u4ee5\u786e\u4fdd\u6574\u4e2a\u6570\u636e\u96c6\u88ab\u8986\u76d6\u3002<code>Sampler<\/code> \u53ef\u4ee5\u5e2e\u52a9\u5b9e\u73b0\u8fd9\u79cd\u6570\u636e\u5206\u914d\u7b56\u7565\uff0c\u907f\u514d\u6570\u636e\u9057\u6f0f\u548c\u5197\u4f59\u3002<\/p>\n\n\n\n<ul><li><strong>\u968f\u673a\u6253\u4e71<\/strong>\uff1a<code>DistributedSampler<\/code> \u8fd8\u652f\u6301\u5728\u6bcf\u4e2a epoch \u5f00\u59cb\u65f6\u6253\u4e71\u6570\u636e\u96c6\uff0c\u8fd9\u5bf9\u4e8e\u8bad\u7ec3\u6a21\u578b\u5177\u6709\u66f4\u597d\u7684\u6cdb\u5316\u80fd\u529b\u662f\u975e\u5e38\u91cd\u8981\u7684\u3002<\/li><\/ul>\n\n\n\n<p>3. <strong>\u907f\u514d\u6570\u636e\u91cd\u590d<\/strong><\/p>\n\n\n\n<p>\u5982\u679c\u4e0d\u4f7f\u7528\u5408\u9002\u7684 <code>Sampler<\/code>\uff0c\u591a\u4e2a\u8fdb\u7a0b\u53ef\u80fd\u4f1a\u5904\u7406\u76f8\u540c\u7684\u6570\u636e\uff0c\u4ece\u800c\u5bfc\u81f4\u6570\u636e\u91cd\u590d\u3002\u8fd9\u4e0d\u4ec5\u6d6a\u8d39\u8ba1\u7b97\u8d44\u6e90\uff0c\u8fd8\u53ef\u80fd\u5f71\u54cd\u6a21\u578b\u7684\u8bad\u7ec3\u6548\u679c\u3002<\/p>\n\n\n\n<ul><li><strong>\u53bb\u91cd<\/strong>\uff1a<code>DistributedSampler<\/code> \u786e\u4fdd\u6bcf\u4e2a\u8fdb\u7a0b\u4ec5\u5904\u7406\u6570\u636e\u96c6\u7684\u4e00\u90e8\u5206\uff0c\u4ece\u800c\u907f\u514d\u6570\u636e\u91cd\u590d\u3002<\/li><\/ul>\n\n\n\n<p>4. 
<strong>\u9002\u5e94\u6279\u91cf\u5927\u5c0f<\/strong><\/p>\n\n\n\n<p>\u5728\u5206\u5e03\u5f0f\u8bad\u7ec3\u4e2d\uff0c\u6570\u636e\u7684\u5206\u914d\u548c\u6279\u5904\u7406\u9700\u8981\u9002\u5e94\u5206\u5e03\u5f0f\u73af\u5883\u4e2d\u7684\u6279\u91cf\u5927\u5c0f\u3002<code>Sampler<\/code> \u8d1f\u8d23\u5c06\u6570\u636e\u5206\u6210\u9002\u5408\u8bad\u7ec3\u7684\u6279\u6b21\uff0c\u5e76\u786e\u4fdd\u6bcf\u4e2a\u8fdb\u7a0b\u5904\u7406\u7684\u6570\u636e\u91cf\u4e0e\u5176\u4ed6\u8fdb\u7a0b\u4e00\u81f4\u3002<\/p>\n\n\n\n<ul><li><strong>BatchSampler<\/strong>\uff1a<code>BatchSampler<\/code> \u5c06\u7531 <code>Sampler<\/code> \u751f\u6210\u7684\u7d22\u5f15\u5217\u8868\u5206\u6210\u6279\u6b21\uff0c\u4ee5\u4fbf\u7528\u4e8e\u8bad\u7ec3\u3002\u5b83\u4e0e <code>DistributedSampler<\/code> \u7ed3\u5408\u4f7f\u7528\u65f6\uff0c\u53ef\u4ee5\u786e\u4fdd\u6bcf\u4e2a\u8fdb\u7a0b\u5904\u7406\u7684\u6570\u636e\u6279\u6b21\u7b26\u5408\u9884\u671f\u7684\u6279\u91cf\u5927\u5c0f\u3002<\/li><\/ul>\n\n\n\n<p>5. <strong>\u652f\u6301\u591a\u6837\u672c\u5904\u7406\u7b56\u7565<\/strong><\/p>\n\n\n\n<p>\u4e0d\u540c\u7684\u4efb\u52a1\u548c\u6a21\u578b\u53ef\u80fd\u9700\u8981\u4e0d\u540c\u7684\u6570\u636e\u5904\u7406\u7b56\u7565\uff0c\u5982\u6392\u5e8f\u3001\u52a8\u6001\u91c7\u6837\u7b49\u3002\u901a\u8fc7\u81ea\u5b9a\u4e49 <code>Sampler<\/code>\uff0c\u53ef\u4ee5\u5b9e\u73b0\u7279\u5b9a\u7684\u91c7\u6837\u7b56\u7565\u4ee5\u6ee1\u8db3\u4efb\u52a1\u9700\u6c42\u3002<\/p>\n\n\n\n<ul><li><strong>\u81ea\u5b9a\u4e49\u91c7\u6837\u5668<\/strong>\uff1a\u53ef\u4ee5\u5b9e\u73b0\u81ea\u5b9a\u4e49\u7684 <code>Sampler<\/code> \u7c7b\uff0c\u6765\u6ee1\u8db3\u7279\u5b9a\u7684\u9700\u6c42\uff0c\u5982\u6309\u6837\u672c\u957f\u5ea6\u6392\u5e8f\u3001\u52a8\u6001\u8c03\u6574\u6279\u6b21\u5927\u5c0f\u7b49\u3002<\/li><\/ul>\n\n\n\n<pre class=\"wp-block-code\"><code>sampler = torch.utils.data.distributed.DistributedSampler(dataset, num_replicas=4, rank=0)\ndataloader = torch.utils.data.DataLoader(dataset, batch_size=2, 
sampler=sampler)\n<\/code><\/pre>\n\n\n\n<p>\u52a8\u624b\u5b9e\u73b0\u4e00\u4e2a\u91c7\u6837\u5668\uff1a<\/p>\n\n\n\n<p><code>CustomDistributedBufferDynamicBatchSampler<\/code> \u662f\u4e00\u4e2a\u7528\u4e8e\u5206\u5e03\u5f0f\u8bad\u7ec3\u7684\u81ea\u5b9a\u4e49\u6570\u636e\u91c7\u6837\u5668\uff0c\u5b83\u7ed3\u5408\u4e86\u52a8\u6001\u6279\u91cf\u5927\u5c0f\u548c\u7f13\u51b2\u533a\u7684\u6392\u5e8f\u7b56\u7565\u3002\u5b83\u7684\u76ee\u7684\u662f\u901a\u8fc7\u66f4\u590d\u6742\u7684\u7b56\u7565\u6765\u751f\u6210\u6279\u91cf\uff0c\u4ee5\u9002\u5e94\u5404\u79cd\u8bad\u7ec3\u9700\u6c42\u3002\u4e0b\u9762\u662f\u5bf9\u8fd9\u4e2a\u91c7\u6837\u5668\u7684\u8be6\u7ec6\u89e3\u91ca\uff1a<\/p>\n\n\n\n<p><code>__iter__<\/code> \u65b9\u6cd5\u751f\u6210\u6570\u636e\u6279\u6b21\uff0c\u8003\u8651\u5230\u52a8\u6001\u6279\u91cf\u5927\u5c0f\u548c\u7f13\u51b2\u533a\u7684\u6392\u5e8f\uff1a<\/p>\n\n\n\n<p><strong>\u6570\u636e\u6253\u4e71<\/strong>\uff1a\u5982\u679c <code>shuffle<\/code> \u4e3a True\uff0c\u6570\u636e\u5c06\u88ab\u6253\u4e71\u3002<strong>\u7f13\u51b2\u533a\u6392\u5e8f<\/strong>\uff1a\u6570\u636e\u88ab\u5206\u6210\u591a\u4e2a\u7f13\u51b2\u533a\uff0c\u6bcf\u4e2a\u7f13\u51b2\u533a\u7684\u5927\u5c0f\u7531 <code>sort_size<\/code> \u63a7\u5236\uff0c\u5e76\u6309\u6837\u672c\u957f\u5ea6\u8fdb\u884c\u6392\u5e8f\u3002<strong>\u6279\u91cf\u751f\u6210<\/strong>\uff1a\u6839\u636e <code>batch_size<\/code> \u548c <code>batch_size_sample_max<\/code> 
\u751f\u6210\u6279\u91cf\u3002\u5982\u679c\u5f53\u524d\u7f13\u51b2\u533a\u4e2d\u7684\u6570\u636e\u65e0\u6cd5\u6ee1\u8db3\u6279\u6b21\u5927\u5c0f\uff0c\u5219\u5c06\u73b0\u6709\u6570\u636e\u4f5c\u4e3a\u4e00\u4e2a\u6279\u6b21\u3002<strong>\u6570\u636e\u91cd\u590d\u548c\u5206\u914d<\/strong>\uff1a\u786e\u4fdd\u6bcf\u4e2a\u8fdb\u7a0b\u83b7\u5f97\u76f8\u540c\u6570\u91cf\u7684\u6279\u6b21\u3002\u5982\u679c\u603b\u6279\u6b21\u4e0d\u8db3\u4ee5\u5747\u5206\uff0c\u91cd\u590d\u4e00\u4e9b\u6279\u6b21\u4ee5\u6ee1\u8db3\u6bcf\u4e2a\u8fdb\u7a0b\u7684\u9700\u6c42\u3002<\/p>\n\n\n\n<p><strong><code>dataset<\/code><\/strong>: \u6570\u636e\u96c6\u5b9e\u4f8b\u3002<strong><code>batch_size<\/code><\/strong>: \u6279\u6b21\u5927\u5c0f\u3002<strong><code>batch_type<\/code><\/strong>: \u6279\u6b21\u7684\u7c7b\u578b\uff08\u4f8b\u5982\u6309 token \u6216\u6837\u672c\uff09\u3002<strong><code>num_replicas<\/code><\/strong>: \u603b\u7684\u8fdb\u7a0b\u6570\u3002<strong><code>rank<\/code><\/strong>: \u5f53\u524d\u8fdb\u7a0b\u7684 rank\u3002<strong><code>rank_split<\/code><\/strong>: \u662f\u5426\u5206\u5272 rank\u3002<strong><code>shuffle<\/code><\/strong>: \u662f\u5426\u6253\u4e71\u6570\u636e\u3002<strong><code>drop_last<\/code><\/strong>: \u662f\u5426\u4e22\u5f03\u6700\u540e\u4e00\u4e2a\u6279\u6b21\u3002<strong><code>is_training<\/code><\/strong>: \u662f\u5426\u5904\u4e8e\u8bad\u7ec3\u6a21\u5f0f\u3002<strong><code>sort_size<\/code><\/strong>: \u7f13\u51b2\u533a\u7684\u5927\u5c0f\uff0c\u7528\u4e8e\u6392\u5e8f\u6570\u636e\u3002<strong><code>start_step<\/code><\/strong>: \u8d77\u59cb\u6b65\u6570\uff08\u7528\u4e8e\u4ece\u7279\u5b9a\u6b65\u6570\u5f00\u59cb\u8bad\u7ec3\uff09\u3002<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>def __init__(\n    self,\n    dataset,\n    batch_size,\n    batch_type=\"token\",\n    num_replicas=None,\n    rank=None,\n    rank_split=False,\n    shuffle=True,\n    drop_last=False,\n    is_training: bool = True,\n    sort_size: int = 1024,\n    start_step: int = 0,\n    
**kwargs,\n):\n    try:\n        rank = dist.get_rank()\n        num_replicas = dist.get_world_size()\n    except:\n        rank = 0\n        num_replicas = 1\n\n    self.rank = rank\n    self.num_replicas = num_replicas\n    self.dataset = dataset\n    self.batch_size = batch_size\n    self.batch_type = batch_type\n    self.is_training = is_training\n    self.shuffle = shuffle and is_training\n    self.drop_last = drop_last\n\n    self.total_size = len(self.dataset)\n    self.num_samples = int(math.ceil(self.total_size \/ self.num_replicas))\n    self.epoch = 0\n    self.sort_size = sort_size * num_replicas\n    self.max_token_length = kwargs.get(\"max_token_length\", 2048)\n    self.length_scale_source = kwargs.get(\"length_scale_source\", 1.0)\n    self.batch_size_sample_max = kwargs.get(\"batch_size_sample_max\", 200)\n    self.start_step = start_step\n    self.batch_num = 1\n    if self.start_step &gt; 0:\n        logging.info(f\"Warning, start_step &gt; 0, dataloader start from step: {self.start_step}\")\n<\/code><\/pre>\n\n\n\n<pre class=\"wp-block-code\"><code>def __iter__(self):\n    if self.shuffle:\n        g = torch.Generator()\n        g.manual_seed(self.epoch)\n        random.seed(self.epoch)\n        indices = torch.randperm(len(self.dataset), generator=g).tolist()\n    else:\n        indices = list(range(len(self.dataset)))\n\n    # Create sorted buffers and form batches\n    buffer_batches = &#091;]\n    for i in range(0, len(indices), self.sort_size):\n        buffer = sorted(\n            indices&#091;i : i + self.sort_size], key=lambda idx: self.dataset.get_source_len(idx)\n        )\n        batch = &#091;]\n        max_len_in_batch = 0\n        count = 1\n        for idx in buffer:\n            original_sample_length = self.dataset.get_source_len(idx)\n            if original_sample_length &gt; self.max_token_length:\n                continue\n            sample_length = 1 if self.batch_type == \"example\" else original_sample_length\n         
   potential_batch_length = max(max_len_in_batch, sample_length) * (len(batch) + 1)\n            if potential_batch_length &lt;= self.batch_size and count &lt; self.batch_size_sample_max:\n                batch.append(idx)\n                max_len_in_batch = max(max_len_in_batch, sample_length)\n                count += 1\n            else:\n                buffer_batches.append(batch)\n                batch = &#091;idx]\n                max_len_in_batch = sample_length\n                count = 1\n        if batch:\n            buffer_batches.append(batch)\n\n    # Ensure each rank gets the same number of batches, duplicate data if needed\n    batches_per_rank = math.ceil(len(buffer_batches) \/ self.num_replicas)\n    total_batches_needed = batches_per_rank * self.num_replicas\n    extra_batches = total_batches_needed - len(buffer_batches)\n    buffer_batches += random.choices(buffer_batches, k=extra_batches)\n\n    # Evenly distribute batches from buffer_batches to each rank\n    rank_batches = &#091;&#091;] for _ in range(self.num_replicas)]\n    for i, batch in enumerate(buffer_batches):\n        rank_batches&#091;i % self.num_replicas].append(batch)\n\n    # Assign all batches for the current rank directly\n    final_batches = rank_batches&#091;self.rank]&#091;self.start_step :]\n    self.batch_num = len(final_batches)\n\n    logging.info(\n        f\"rank: {self.rank}, dataloader start from step: {self.start_step}, batch_num: {len(rank_batches&#091;self.rank])}, after: {self.batch_num}\"\n    )\n    return iter(final_batches)\n<\/code><\/pre>\n\n\n\n<p><code>CustomDistributedBufferDynamicBatchSampler<\/code> \u901a\u8fc7\u4ee5\u4e0b\u65b9\u5f0f\u589e\u5f3a\u4e86\u6570\u636e\u91c7\u6837\uff1a<\/p>\n\n\n\n<ul class=\"has-light-pink-background-color 
has-background\"><li><strong>\u52a8\u6001\u6279\u91cf\u5927\u5c0f<\/strong>\uff1a\u6839\u636e\u6570\u636e\u7684\u5b9e\u9645\u957f\u5ea6\u52a8\u6001\u8c03\u6574\u6279\u91cf\u5927\u5c0f\u3002<\/li><li><strong>\u7f13\u51b2\u533a\u6392\u5e8f<\/strong>\uff1a\u4f7f\u7528\u6392\u5e8f\u7f13\u51b2\u533a\u7b56\u7565\u63d0\u9ad8\u6570\u636e\u5904\u7406\u6548\u7387\u3002<\/li><li><strong>\u6570\u636e\u5747\u5300\u5206\u914d<\/strong>\uff1a<strong>\u786e\u4fdd\u6bcf\u4e2a\u8fdb\u7a0b\u83b7\u5f97\u76f8\u540c\u6570\u91cf\u7684\u6279\u6b21\uff0c\u907f\u514d\u6570\u636e\u4e0d\u5747\u8861\u3002<\/strong><\/li><\/ul>\n\n\n\n<p>\u8fd9\u4e9b\u7279\u6027\u4f7f\u5f97 <code>CustomDistributedBufferDynamicBatchSampler<\/code> \u80fd\u591f\u66f4\u597d\u5730\u5904\u7406\u5927\u89c4\u6a21\u6570\u636e\u96c6\uff0c\u5e76\u5728\u5206\u5e03\u5f0f\u8bad\u7ec3\u4e2d\u63d0\u4f9b\u9ad8\u6548\u7684\u6570\u636e\u52a0\u8f7d\u548c\u6279\u6b21\u751f\u6210\u7b56\u7565\u3002<\/p>\n\n\n\n<p class=\"has-light-blue-background-color has-background\"><strong>\u6570\u636e\u5747\u5300\u5206\u914d\u81f3\u5173\u91cd\u8981\uff1a<\/strong>\u5982\u679c\u5206\u914d\u4e0d\u5747\uff0c\u4f1a\u5bfc\u81f4\u67d0\u4e2a\u8282\u70b9\u7684GPU\u663e\u5b58\u7206\u70b8\uff0c\u5bfc\u81f4\u77ed\u677f\u6548\u5e94\uff0c\u6240\u4ee5\u9700\u8981\u5bf9\u6570\u636e\u8fdb\u884c\u5e73\u5747\u5206\u914d\uff1a<\/p>\n\n\n\n<figure class=\"wp-block-image size-full\"><img loading=\"lazy\" width=\"1021\" height=\"525\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2024\/08\/image-8.png\" alt=\"\" class=\"wp-image-16703\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2024\/08\/image-8.png 1021w, http:\/\/139.9.1.231\/wp-content\/uploads\/2024\/08\/image-8-300x154.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2024\/08\/image-8-768x395.png 768w\" sizes=\"(max-width: 1021px) 100vw, 1021px\" \/><\/figure>\n\n\n\n<figure class=\"wp-block-image size-large\"><img loading=\"lazy\" width=\"1024\" height=\"237\" 
src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2024\/08\/image-9-1024x237.png\" alt=\"\" class=\"wp-image-16704\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2024\/08\/image-9-1024x237.png 1024w, http:\/\/139.9.1.231\/wp-content\/uploads\/2024\/08\/image-9-300x69.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2024\/08\/image-9-768x177.png 768w, http:\/\/139.9.1.231\/wp-content\/uploads\/2024\/08\/image-9.png 1026w\" sizes=\"(max-width: 1024px) 100vw, 1024px\" \/><\/figure>\n\n\n\n<h3>\u5206\u5e03\u5f0f\u8bad\u7ec3\u7684\u65f6\u5019 \u5982\u4f55\u5b9a\u4e49\u81ea\u5df1\u7684sampler\uff0c\u5982\u4f55\u4fdd\u8bc1\u4e0d\u540c\u7684\u8282\u70b9\u4f7f\u7528\u4e0d\u540c\u7684\u6570\u636e\u8bad\u7ec3\uff1f<\/h3>\n\n\n\n<p>\u6839\u636erank\u6570\u91cf\u5c06\u7d22\u5f15\u5206\u6210\u4e0d\u540c\u7684rank\u4efd\u3002 <code>\u5206\u5272\u6570\u636e\u4ee5\u786e\u4fdd\u6bcf\u4e2a\u8fdb\u7a0b\u83b7\u53d6\u4e0d\u540c\u7684\u7d22\u5f15<\/code> \u3002<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>        if self.num_replicas is not None and self.rank is not None:\n            # \u6bcf\u4e2a\u8fdb\u7a0b\u5904\u7406\u7684\u6570\u636e\u7d22\u5f15\u8303\u56f4\n            num_samples = int(np.ceil(len(indices) \/ self.num_replicas))\n            start = self.rank * num_samples\n            end = min(start + num_samples, len(indices))\n            indices = indices&#091;start:end]<\/code><\/pre>\n\n\n\n<h4>1. 
\u5b9a\u4e49\u81ea\u5b9a\u4e49Sampler<\/h4>\n\n\n\n<p>\u81ea\u5b9a\u4e49<code>Sampler<\/code>\u9700\u8981\u7ee7\u627f<code>torch.utils.data.Sampler<\/code>\u5e76\u5b9e\u73b0<code>__iter__<\/code>\u65b9\u6cd5\uff0c\u8fd4\u56de\u6570\u636e\u7d22\u5f15\u7684\u8fed\u4ee3\u5668\u3002\u4ee5\u4e0b\u662f\u4e00\u4e2a\u7b80\u5355\u7684\u793a\u4f8b\uff1a<\/p>\n\n\n\n<pre class=\"wp-block-preformatted\">python\u590d\u5236\u4ee3\u7801<code>import torch\nimport numpy as np\n\nclass CustomSampler(torch.utils.data.Sampler):\n    def __init__(self, data_source, num_replicas=None, rank=None):\n        self.data_source = data_source\n        self.num_replicas = num_replicas\n        self.rank = rank\n\n    def __iter__(self):\n        # \u83b7\u53d6\u6240\u6709\u6837\u672c\u7d22\u5f15\n        indices = np.arange(len(self.data_source))\n\n        # \u5206\u5272\u6570\u636e\u4ee5\u786e\u4fdd\u6bcf\u4e2a\u8fdb\u7a0b\u83b7\u53d6\u4e0d\u540c\u7684\u7d22\u5f15\n        if self.num_replicas is not None and self.rank is not None:\n            # \u6bcf\u4e2a\u8fdb\u7a0b\u5904\u7406\u7684\u6570\u636e\u7d22\u5f15\u8303\u56f4\n            num_samples = int(np.ceil(len(indices) \/ self.num_replicas))\n            start = self.rank * num_samples\n            end = min(start + num_samples, len(indices))\n            indices = indices[start:end]\n\n        # \u6253\u4e71\u6570\u636e\n        np.random.shuffle(indices)\n        return iter(indices)\n\n    def __len__(self):\n        if self.num_replicas is not None and self.rank is not None:\n            num_samples = int(np.ceil(len(self.data_source) \/ self.num_replicas))\n            return num_samples\n        return len(self.data_source)\n<\/code><\/pre>\n\n\n\n<h4>2. 
\u521d\u59cb\u5316\u5206\u5e03\u5f0f\u73af\u5883<\/h4>\n\n\n\n<p>\u5728\u8bad\u7ec3\u811a\u672c\u4e2d\uff0c\u521d\u59cb\u5316\u5206\u5e03\u5f0f\u73af\u5883\u5e76\u521b\u5efa\u81ea\u5b9a\u4e49\u91c7\u6837\u5668\u3002<\/p>\n\n\n\n<pre class=\"wp-block-preformatted\">python\u590d\u5236\u4ee3\u7801<code>import torch\nimport torch.distributed as dist\n\ndist.init_process_group(backend='nccl')  # \u6216 'gloo'\nlocal_rank = dist.get_rank()\nworld_size = dist.get_world_size()\n\n# \u6570\u636e\u96c6\nfrom torchvision import datasets, transforms\ntransform = transforms.Compose([transforms.ToTensor()])\ndataset = datasets.CIFAR10(root='.\/data', train=True, transform=transform, download=True)\n\n# \u521b\u5efa\u81ea\u5b9a\u4e49\u91c7\u6837\u5668\nsampler = CustomSampler(dataset, num_replicas=world_size, rank=local_rank)\n\n# \u521b\u5efa\u6570\u636e\u52a0\u8f7d\u5668\ndataloader = torch.utils.data.DataLoader(dataset, batch_size=32, sampler=sampler)\n<\/code><\/pre>\n\n\n\n<h4>3. \u5728\u8bad\u7ec3\u65f6\u8bbe\u7f6e\u91c7\u6837\u5668\u7684epoch<\/h4>\n\n\n\n<p>\u5982\u679c\u4f60\u7684\u81ea\u5b9a\u4e49<code>Sampler<\/code>\u9700\u8981\u5728\u6bcf\u4e2aepoch\u4e2d\u66f4\u6539\u6570\u636e\u987a\u5e8f\uff0c\u53ef\u4ee5\u5728\u6bcf\u4e2aepoch\u5f00\u59cb\u65f6\u8c03\u7528<code>sampler.set_epoch(epoch)<\/code>\u3002<\/p>\n\n\n\n<pre class=\"wp-block-preformatted\">python\u590d\u5236\u4ee3\u7801<code>for epoch in range(num_epochs):\n    sampler.set_epoch(epoch)  # \u5982\u679c\u4f60\u7684\u81ea\u5b9a\u4e49Sampler\u652f\u6301\u8fd9\u4e2a\u65b9\u6cd5\n    for batch in dataloader:\n        # \u8bad\u7ec3\u4ee3\u7801\n<\/code><\/pre>\n\n\n\n<p>\u8fd9\u6837\uff0c\u4f60\u5c31\u53ef\u4ee5\u5b9a\u4e49\u4e00\u4e2a\u9002\u5408\u4f60\u9700\u6c42\u7684\u81ea\u5b9a\u4e49<code>Sampler<\/code>\uff0c\u5e76\u5728\u5206\u5e03\u5f0f\u8bad\u7ec3\u4e2d\u4f7f\u7528\u5b83\u3002<\/p>\n\n\n\n<h3>DDP\u5206\u5e03\u5f0f\u8bad\u7ec3\u65f6\u5019 
batchsize\u8bbe\u7f6e\u662f\u6307\u5355\u5361\u8fd8\u662f\u591a\u5361\u6240\u6709\u7684\u603bbatch\uff1f<\/h3>\n\n\n\n<p class=\"has-light-pink-background-color has-background\"><strong>\u5728\u5206\u5e03\u5f0f\u6570\u636e\u5e76\u884c\uff08DDP\uff09\u8bad\u7ec3\u4e2d\uff0c<code>batch_size<\/code>\u7684\u8bbe\u7f6e\u662f\u6307\u6bcf\u4e2a\u5355\u5361\uff08\u5373\u6bcf\u4e2aGPU\uff09\u7684batch size\u3002\u603b\u7684batch size\u662f\u6bcf\u4e2a\u5355\u5361\u7684batch size\u4e58\u4ee5GPU\u7684\u6570\u91cf\u3002\u3010\u5728sampler\u91c7\u6837\u7684\u65f6\u5019\uff0c\u6839\u636erank\u6570\u91cf\uff0c\u5c06index \u5206\u5272\u6210 rank\u4efd\uff0c\u6bcf\u4e00\u4efd\u91cc\u9762\u8fdb\u884cbatchsize\u7684\u91c7\u6837\uff0c\u6240\u4ee5bs\u6307\u7684\u662f\u5355\u4e2aGPU\u7684bs\u3011<\/strong><\/p>\n\n\n\n<p>\u4f8b\u5982\uff0c\u5982\u679c\u4f60\u67094\u4e2aGPU\uff0c\u5e76\u4e14\u6bcf\u4e2aGPU\u7684batch size\u8bbe\u7f6e\u4e3a32\uff0c\u90a3\u4e48\u603b\u7684batch size\u5c31\u662f32 * 4 = 128\u3002\u6bcf\u4e2aGPU\u5728\u6bcf\u6b21\u8bad\u7ec3\u8fed\u4ee3\u4e2d\u5904\u740632\u4e2a\u6837\u672c\uff0c\u6240\u67094\u4e2aGPU\u5728\u6bcf\u6b21\u8bad\u7ec3\u8fed\u4ee3\u4e2d\u5904\u7406\u603b\u5171128\u4e2a\u6837\u672c\u3002<\/p>\n\n\n\n<p>\u5982\u679c\u4f60\u4f7f\u7528\u7684\u662f\u5206\u5e03\u5f0f\u6570\u636e\u5e76\u884c\u7684\u8bad\u7ec3\u7b56\u7565\uff0c\u786e\u4fdd\u5c06<code>batch_size<\/code>\u8bbe\u7f6e\u4e3a\u6bcf\u4e2aGPU\u4e0a\u5e0c\u671b\u7684\u5927\u5c0f\uff0c\u800c\u4e0d\u662f\u603b\u7684batch size\u3002<\/p>\n\n\n\n<h3>DataLoader\u4e2d\u8bbe\u7f6e\u7684 num_workers \u5728DDP\u8bad\u7ec3\u4e2d\u662f\u5982\u4f55\u5de5\u4f5c\u7684\uff1f<\/h3>\n\n\n\n<p class=\"has-light-pink-background-color has-background\"><strong>\u9996\u5148\u660e\u786e\u4e00\u70b9\uff1a 
num_workers\u6307\u7684\u662f\u5355\u4e2aGPU\u7684num_workers\u6570\u636e\u52a0\u8f7d\u8fdb\u7a0b\u6570\u91cf\u3002<\/strong><\/p>\n\n\n\n<ul><li><strong><code>num_workers<\/code><\/strong>\u53c2\u6570\u5b9a\u4e49\u4e86\u5e76\u884c\u6570\u636e\u52a0\u8f7d\u7684\u8fdb\u7a0b\u6570\u91cf\u3002\u6bcf\u4e2a\u8fdb\u7a0b\u72ec\u7acb\u5730\u4ece\u6570\u636e\u96c6\u4e2d\u8bfb\u53d6\u548c\u9884\u5904\u7406\u6570\u636e\u3002<\/li><li><strong><code>collate_fn<\/code><\/strong>\u53ef\u4ee5\u81ea\u5b9a\u4e49\u5982\u4f55\u5c06\u6570\u636e\u9879\u7ec4\u5408\u6210batch\u3002<\/li><li>\u6570\u636e\u52a0\u8f7d\u8fdb\u7a0b\u5c06\u9884\u5904\u7406\u540e\u7684\u6570\u636e\u6279\u6b21\u4f20\u9012\u7ed9\u4e3b\u8fdb\u7a0b\uff0c\u4e3b\u8fdb\u7a0b\u5c06\u8fd9\u4e9b\u6279\u6b21\u6570\u636e\u9001\u5165\u6a21\u578b\u8fdb\u884c\u8bad\u7ec3\u3002<\/li><\/ul>\n\n\n\n<p>\u4f7f\u7528\u591a\u4e2a\u6570\u636e\u52a0\u8f7d\u8fdb\u7a0b\u53ef\u4ee5\u63d0\u9ad8\u6570\u636e\u9884\u5904\u7406\u7684\u901f\u5ea6\uff0c\u51cf\u5c11GPU\u5728\u8bad\u7ec3\u65f6\u7684\u7b49\u5f85\u65f6\u95f4\uff0c\u4ece\u800c\u52a0\u5feb\u6574\u4f53\u8bad\u7ec3\u8fc7\u7a0b\u3002<\/p>\n\n\n\n<h4><code>num_workers<\/code>\u7684\u4f5c\u7528<\/h4>\n\n\n\n<ul><li><strong>\u6570\u636e\u52a0\u8f7d<\/strong>: <code>num_workers<\/code>\u51b3\u5b9a\u4e86\u7528\u4e8e\u52a0\u8f7d\u6570\u636e\u7684\u5b50\u8fdb\u7a0b\u7684\u6570\u91cf\u3002\u66f4\u591a\u7684\u5de5\u4f5c\u8fdb\u7a0b\u53ef\u4ee5\u5e76\u884c\u5730\u8bfb\u53d6\u548c\u9884\u5904\u7406\u6570\u636e\uff0c\u4ece\u800c\u52a0\u5feb\u6570\u636e\u52a0\u8f7d\u901f\u5ea6\uff0c\u51cf\u5c11GPU\u7684\u7b49\u5f85\u65f6\u95f4\u3002<\/li><li><strong>\u6027\u80fd\u5f71\u54cd<\/strong>: 
\u589e\u52a0<code>num_workers<\/code>\u7684\u6570\u91cf\u901a\u5e38\u53ef\u4ee5\u63d0\u9ad8\u6570\u636e\u52a0\u8f7d\u901f\u5ea6\uff0c\u4f46\u4e5f\u4f1a\u589e\u52a0\u7cfb\u7edf\u7684\u5185\u5b58\u4f7f\u7528\u3002\u5408\u7406\u8bbe\u7f6e<code>num_workers<\/code>\u7684\u503c\u53ef\u4ee5\u5728\u6570\u636e\u52a0\u8f7d\u6548\u7387\u548c\u7cfb\u7edf\u8d44\u6e90\u4f7f\u7528\u4e4b\u95f4\u627e\u5230\u5e73\u8861\u3002<\/li><\/ul>\n\n\n\n<h4>\u5728DDP\u8bad\u7ec3\u4e2d\u7684\u8003\u8651<\/h4>\n\n\n\n<ol class=\"has-light-pink-background-color has-background\"><li><strong>\u6bcf\u4e2a\u8fdb\u7a0b\u7684<code>num_workers<\/code><\/strong>: \u6bcf\u4e2a\u5206\u5e03\u5f0f\u8fdb\u7a0b\uff08\u5373\u6bcf\u4e2aGPU\uff09\u90fd\u6709\u81ea\u5df1\u7684\u6570\u636e\u52a0\u8f7d\u5b50\u8fdb\u7a0b\u3002<strong>\u8fd9\u610f\u5473\u7740\u603b\u7684<code>num_workers<\/code>\u4f1a\u662f\u6bcf\u4e2aGPU\u4e0a<code>num_workers<\/code>\u7684\u503c\u4e58\u4ee5GPU\u7684\u6570\u91cf\uff08\u5206\u5e03\u5f0f\u8fdb\u7a0b\u6570\uff09\u3002\u4f8b\u5982\uff0c\u5982\u679c\u67094\u4e2aGPU\uff0c\u5e76\u4e14\u6bcf\u4e2aGPU\u7684<code>num_workers<\/code>\u8bbe\u7f6e\u4e3a4\uff0c\u90a3\u4e48\u603b\u7684\u5de5\u4f5c\u8fdb\u7a0b\u6570\u5c06\u662f4 * 4 = 16\u3002<\/strong><\/li><li><strong>\u907f\u514d\u6570\u636e\u91cd\u53e0<\/strong>: \u5728\u5206\u5e03\u5f0f\u8bad\u7ec3\u4e2d\uff0c\u9700\u8981\u786e\u4fdd\u6bcf\u4e2a\u8fdb\u7a0b\u5904\u7406\u7684\u6570\u636e\u5b50\u96c6\u662f\u4e0d\u540c\u7684\u3002\u4f7f\u7528<code>DistributedSampler<\/code>\u53ef\u4ee5\u786e\u4fdd\u6570\u636e\u5728\u5404\u4e2a\u8fdb\u7a0b\u95f4\u5747\u5300\u5206\u914d\uff0c\u4ece\u800c\u907f\u514d\u6570\u636e\u91cd\u590d\u548c\u4e22\u5931\u3002<\/li><li><strong>\u540c\u6b65\u548c\u901a\u4fe1\u5f00\u9500<\/strong>: 
<strong>\u589e\u52a0<code>num_workers<\/code>\u7684\u6570\u91cf\u53ef\u80fd\u4f1a\u589e\u52a0\u8fdb\u7a0b\u95f4\u7684\u540c\u6b65\u548c\u901a\u4fe1\u5f00\u9500\uff0c\u7279\u522b\u662f\u5728\u591aGPU\u7684\u60c5\u51b5\u4e0b\u3002\u9700\u8981\u6839\u636e\u5177\u4f53\u7684\u786c\u4ef6\u914d\u7f6e\u548c\u6570\u636e\u96c6\u5927\u5c0f\u6765\u8c03\u6574<code>num_workers<\/code>\u3002<\/strong><\/li><li><strong>\u5185\u5b58\u548cCPU\u8d44\u6e90<\/strong>: <strong>\u6bcf\u589e\u52a0\u4e00\u4e2a\u5de5\u4f5c\u8fdb\u7a0b\uff0c\u90fd\u4f1a\u6d88\u8017\u989d\u5916\u7684CPU\u8d44\u6e90\u548c\u5185\u5b58\u3002\u786e\u4fdd\u4f60\u7684\u7cfb\u7edf\u6709\u8db3\u591f\u7684\u8d44\u6e90\u6765\u652f\u6301\u8bbe\u7f6e\u7684<code>num_workers<\/code>\u503c\u3002<\/strong><\/li><\/ol>\n\n\n\n<figure class=\"wp-block-image size-large\"><img loading=\"lazy\" width=\"1024\" height=\"116\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2024\/08\/image-11-1024x116.png\" alt=\"\" class=\"wp-image-16776\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2024\/08\/image-11-1024x116.png 1024w, http:\/\/139.9.1.231\/wp-content\/uploads\/2024\/08\/image-11-300x34.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2024\/08\/image-11-768x87.png 768w, http:\/\/139.9.1.231\/wp-content\/uploads\/2024\/08\/image-11.png 1249w\" sizes=\"(max-width: 1024px) 100vw, 1024px\" \/><\/figure>\n\n\n\n<h3>\u5355\u4e2anum_workers\u5b50\u8fdb\u7a0b\u5355\u72ec\u8d1f\u8d23\u4e00\u4e2abatch\u7684\u6570\u636e\u5417\uff0c\u7136\u540e\u591a\u4e2a\u8fdb\u7a0b\u8d1f\u8d23\u52a0\u8f7d\u591a\u4e2a\u4e0d\u540cbatch\u6570\u636e\uff1f<\/h3>\n\n\n\n<p class=\"has-light-pink-background-color 
has-background\"><strong>\u5728PyTorch\u4e2d\uff0c<code>DataLoader<\/code>\u4e2d\u7684\u5b50\u8fdb\u7a0b\uff08\u7531<code>num_workers<\/code>\u53c2\u6570\u6307\u5b9a\uff09\u5e76\u4e0d\u4e00\u5b9a\u662f\u6bcf\u4e2a\u8fdb\u7a0b\u8d1f\u8d23\u4e00\u4e2a\u5b8c\u6574\u7684batch\u7684\u6570\u636e\u3002\u5b9e\u9645\u64cd\u4f5c\u4e2d\uff0c\u591a\u4e2a\u5b50\u8fdb\u7a0b\u8d1f\u8d23\u5e76\u884c\u5730\u9884\u5904\u7406\u6570\u636e\u5e76\u5c06\u5176\u9001\u5165\u4e3b\u8fdb\u7a0b\u3002\u4e0b\u9762\u662f\u8be6\u7ec6\u7684\u89e3\u91ca\uff1a<\/strong><\/p>\n\n\n\n<h4>\u6570\u636e\u52a0\u8f7d\u8fdb\u7a0b\u7684\u5de5\u4f5c\u65b9\u5f0f<\/h4>\n\n\n\n<ol><li><strong>\u5b50\u8fdb\u7a0b\u7684\u4efb\u52a1<\/strong>: \u6bcf\u4e2a\u6570\u636e\u52a0\u8f7d\u8fdb\u7a0b\u4ece\u6570\u636e\u96c6\u4e2d\u63d0\u53d6\u6837\u672c\uff0c\u5e76\u6267\u884c\u9884\u5904\u7406\u4efb\u52a1\u3002\u5b50\u8fdb\u7a0b\u4f1a\u4ece\u6570\u636e\u96c6\u4e2d\u8bfb\u53d6\u5355\u4e2a\u6837\u672c\uff08\u6216\u591a\u4e2a\u6837\u672c\uff09\uff0c\u8fdb\u884c\u5fc5\u8981\u7684\u8f6c\u6362\u548c\u9884\u5904\u7406\uff0c\u7136\u540e\u5c06\u8fd9\u4e9b\u5904\u7406\u540e\u7684\u6837\u672c\u8fd4\u56de\u7ed9\u4e3b\u8fdb\u7a0b\u3002<\/li><li><strong>\u6279\u6b21\u7684\u751f\u6210<\/strong>: <code>DataLoader<\/code>\u5728\u4e3b\u8fdb\u7a0b\u4e2d\u5904\u7406\u6279\u6b21\u7684\u751f\u6210\u3002\u4e3b\u8fdb\u7a0b\u8d1f\u8d23\u5c06\u4ece\u5404\u4e2a\u5b50\u8fdb\u7a0b\u4e2d\u63a5\u6536\u5230\u7684\u6837\u672c\u805a\u5408\u6210\u4e00\u4e2a\u5b8c\u6574\u7684batch\u3002\u5177\u4f53\u6765\u8bf4\uff0c\u5f53\u4e3b\u8fdb\u7a0b\u9700\u8981\u4e00\u4e2a\u65b0\u7684batch\u65f6\uff0c\u5b83\u4f1a\u4ece\u5b50\u8fdb\u7a0b\u4e2d\u83b7\u53d6\u6837\u672c\uff0c\u7136\u540e\u4f7f\u7528<code>collate_fn<\/code>\u5c06\u8fd9\u4e9b\u6837\u672c\u7ec4\u5408\u6210\u4e00\u4e2abatch\u3002<\/li><li><strong>\u6570\u636e\u5e76\u884c\u5904\u7406<\/strong>: \u5982\u679c\u8bbe\u7f6e\u4e86<code>num_workers &gt; 
1<\/code>\uff0c<code>DataLoader<\/code>\u4f1a\u542f\u52a8\u591a\u4e2a\u5b50\u8fdb\u7a0b\u6765\u5e76\u884c\u5730\u52a0\u8f7d\u6570\u636e\u3002\u8fd9\u4e9b\u5b50\u8fdb\u7a0b\u5e76\u4e0d\u72ec\u7acb\u5904\u7406\u5b8c\u6574\u7684batch\uff0c\u800c\u662f\u5e76\u884c\u5730\u4ece\u6570\u636e\u96c6\u4e2d\u63d0\u53d6\u548c\u9884\u5904\u7406\u6837\u672c\u3002\u4e3b\u8fdb\u7a0b\u4f1a\u4ece\u8fd9\u4e9b\u5b50\u8fdb\u7a0b\u4e2d\u6536\u96c6\u6837\u672c\uff0c\u5e76\u5728\u4e3b\u8fdb\u7a0b\u4e2d\u5c06\u6837\u672c\u7ec4\u6210batch\u3002<\/li><\/ol>\n\n\n\n<h4>\u6570\u636e\u52a0\u8f7d\u793a\u4f8b<\/h4>\n\n\n\n<p>\u5047\u8bbe\u4f60\u8bbe\u7f6e\u4e86<code>num_workers=4<\/code>\uff0c\u8fd9\u610f\u5473\u7740\u4f1a\u67094\u4e2a\u5b50\u8fdb\u7a0b\u5e76\u884c\u5730\u5904\u7406\u6570\u636e\u3002\u6570\u636e\u52a0\u8f7d\u7684\u8fc7\u7a0b\u5927\u81f4\u5982\u4e0b\uff1a<\/p>\n\n\n\n<ol><li><strong>\u5b50\u8fdb\u7a0b\u8bfb\u53d6\u6570\u636e<\/strong>:<ul><li>\u6bcf\u4e2a\u5b50\u8fdb\u7a0b\u4ece\u6570\u636e\u96c6\u4e2d\u8bfb\u53d6\u6837\u672c\u3002<\/li><li>\u5b50\u8fdb\u7a0b\u5bf9\u6837\u672c\u8fdb\u884c\u9884\u5904\u7406\uff08\u5982\u6570\u636e\u589e\u5f3a\u3001\u6807\u51c6\u5316\u7b49\uff09\u3002<\/li><\/ul><\/li><li><strong>\u4e3b\u8fdb\u7a0b\u6536\u96c6\u6570\u636e<\/strong>:<ul><li>\u4e3b\u8fdb\u7a0b\u4ece\u5b50\u8fdb\u7a0b\u4e2d\u6536\u96c6\u5904\u7406\u540e\u7684\u6837\u672c\u3002<\/li><li>\u4e3b\u8fdb\u7a0b\u4f7f\u7528<code>collate_fn<\/code>\u5c06\u6837\u672c\u7ec4\u5408\u6210\u4e00\u4e2abatch\u3002<\/li><\/ul><\/li><li><strong>\u751f\u6210\u6279\u6b21<\/strong>:<ul><li>\u4e3b\u8fdb\u7a0b\u5c06\u751f\u6210\u7684batch\u4f20\u9012\u7ed9\u8bad\u7ec3\u5faa\u73af\u3002<\/li><\/ul><\/li><\/ol>\n\n\n\n<h3>\u4e3a\u4ec0\u4e48numberwork\u8bbe\u7f6e\u5927\u4e86\u4f1a\u589e\u52a0CPU\u5185\u5b58\uff1f<\/h3>\n\n\n\n<p>\u8bbe\u7f6e\u8f83\u5927\u7684 <code>num_workers<\/code> \u503c\u4f1a\u589e\u52a0 CPU \u5185\u5b58\u4f7f\u7528\u7684\u539f\u56e0\u6709\u51e0\u4e2a\u65b9\u9762\uff1a<\/p>\n\n\n\n<h4>1. 
<strong>\u8fdb\u7a0b\u6570\u91cf\u548c\u5185\u5b58\u5360\u7528<\/strong><\/h4>\n\n\n\n<ul><li><strong>\u6bcf\u4e2a\u5b50\u8fdb\u7a0b\u7684\u5185\u5b58\u6d88\u8017<\/strong>: \u6bcf\u4e2a\u6570\u636e\u52a0\u8f7d\u5b50\u8fdb\u7a0b\uff08\u7531 <code>num_workers<\/code> \u5b9a\u4e49\uff09\u90fd\u4f1a\u72ec\u7acb\u5730\u8fd0\u884c\uff0c\u5e76\u52a0\u8f7d\u4e00\u90e8\u5206\u6570\u636e\u96c6\u3002\u6bcf\u4e2a\u5b50\u8fdb\u7a0b\u4f1a\u4f7f\u7528\u81ea\u5df1\u7684\u5185\u5b58\u6765\u5b58\u50a8\u6570\u636e\u548c\u8fdb\u884c\u9884\u5904\u7406\u64cd\u4f5c\u3002<\/li><li><strong>\u5185\u5b58\u9700\u6c42<\/strong>: \u5982\u679c <code>num_workers<\/code> \u8bbe\u7f6e\u5f97\u5f88\u9ad8\uff0c\u7cfb\u7edf\u5c06\u4f1a\u542f\u52a8\u591a\u4e2a\u5b50\u8fdb\u7a0b\uff0c\u8fd9\u4e9b\u8fdb\u7a0b\u4f1a\u540c\u65f6\u5b58\u5728\u5e76\u5360\u7528\u5185\u5b58\u3002\u6bcf\u4e2a\u8fdb\u7a0b\u90fd\u9700\u8981\u4e00\u5b9a\u7684\u5185\u5b58\u6765\u5b58\u50a8\u6570\u636e\u548c\u8fd0\u884c\u9884\u5904\u7406\u4ee3\u7801\uff0c\u4ece\u800c\u5bfc\u81f4\u603b\u7684\u5185\u5b58\u4f7f\u7528\u589e\u52a0\u3002<\/li><\/ul>\n\n\n\n<h4>2. 
<strong>\u6570\u636e\u9884\u5904\u7406\u548c\u7f13\u5b58<\/strong><\/h4>\n\n\n\n<ul><li><strong>\u6570\u636e\u7f13\u51b2<\/strong>: <code>DataLoader<\/code> \u4f7f\u7528\u5b50\u8fdb\u7a0b\u6765\u5e76\u884c\u52a0\u8f7d\u548c\u9884\u5904\u7406\u6570\u636e\u3002\u5728\u9884\u5904\u7406\u8fc7\u7a0b\u4e2d\uff0c\u5b50\u8fdb\u7a0b\u53ef\u80fd\u4f1a\u521b\u5efa\u548c\u7ef4\u62a4\u7f13\u5b58\uff0c\u8fd9\u4e9b\u7f13\u5b58\u53ef\u80fd\u4f1a\u6d88\u8017\u989d\u5916\u7684\u5185\u5b58\u3002<\/li><li><strong>\u6570\u636e\u52a0\u8f7d<\/strong>: \u8fdb\u7a0b\u5728\u6570\u636e\u52a0\u8f7d\u8fc7\u7a0b\u4e2d\u53ef\u80fd\u4f1a\u5728\u5185\u5b58\u4e2d\u4fdd\u6301\u4e00\u5b9a\u91cf\u7684\u6570\u636e\uff0c\u4ee5\u63d0\u9ad8\u6570\u636e\u5904\u7406\u6548\u7387\u3002\u8fd9\u79cd\u5185\u5b58\u7684\u5360\u7528\u4e5f\u4f1a\u968f\u7740 <code>num_workers<\/code> \u7684\u589e\u52a0\u800c\u589e\u52a0\u3002<\/li><\/ul>\n\n\n\n<h4>3. <strong>\u5e76\u53d1\u5904\u7406<\/strong><\/h4>\n\n\n\n<ul><li><strong>\u5e76\u53d1\u5f00\u9500<\/strong>: \u542f\u52a8\u5927\u91cf\u7684\u5b50\u8fdb\u7a0b\u8fdb\u884c\u6570\u636e\u5904\u7406\u4f1a\u589e\u52a0\u7cfb\u7edf\u7684\u5e76\u53d1\u5f00\u9500\u3002\u64cd\u4f5c\u7cfb\u7edf\u9700\u8981\u4e3a\u6bcf\u4e2a\u8fdb\u7a0b\u5206\u914d\u5185\u5b58\u548c\u7ba1\u7406\u8d44\u6e90\uff0c\u8fd9\u4f1a\u5bfc\u81f4\u7cfb\u7edf\u6574\u4f53\u7684\u5185\u5b58\u4f7f\u7528\u589e\u52a0\u3002<\/li><li><strong>\u8fdb\u7a0b\u95f4\u901a\u4fe1<\/strong>: \u591a\u4e2a\u5b50\u8fdb\u7a0b\u4e4b\u95f4\u53ef\u80fd\u4f1a\u6709\u6570\u636e\u4ea4\u6362\u548c\u540c\u6b65\u64cd\u4f5c\uff0c\u8fd9\u4e9b\u64cd\u4f5c\u4e5f\u53ef\u80fd\u589e\u52a0\u5185\u5b58\u5f00\u9500\u3002<\/li><\/ul>\n\n\n\n<h2>\u5927\u6a21\u578b\u8bad\u7ec3\u4e2d\u7684\u6570\u636e\u52a0\u8f7d\u548cNCCL\u901a\u4fe1\u95ee\u9898<\/h2>\n\n\n\n<h3>A\u3001\u8bad\u7ec3\u5927\u6a21\u578b\u65f6\u5019\uff0c\u6709\u4e24\u4ebf\u7684\u6570\u636e\uff0c\u6570\u636e\u7d22\u5f15\u4fdd\u5b58\u5230\u4e86jsonl\u6587\u4ef6\u4e2d\uff0c\u5728torch 
dataloader \u52a0\u8f7d\u6570\u636ejsonl\u6587\u4ef6\u65f6\u5019\u7206\u5185\u5b58\uff0c\u5982\u4f55\u89e3\u51b3<\/h3>\n\n\n\n<h3>1. <strong>\u4f7f\u7528\u5206\u5757\u52a0\u8f7d\uff08Chunk Loading\uff09<\/strong>\u3010\u6cd51\u3011<\/h3>\n\n\n\n<p>\u5c06\u6570\u636e\u5206\u5757\u5904\u7406\uff0c\u800c\u4e0d\u662f\u4e00\u6b21\u6027\u52a0\u8f7d\u6240\u6709\u6570\u636e\u3002\u53ef\u4ee5\u5728<code>Dataset<\/code>\u7c7b\u4e2d\u5b9e\u73b0\u8fd9\u4e00\u70b9\u3002\u793a\u4f8b\u4ee3\u7801\u5982\u4e0b\uff1a<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>import json\nimport torch\nfrom torch.utils.data import Dataset, DataLoader\n\nclass LargeJSONLDataset(Dataset):\n    def __init__(self, jsonl_file, chunk_size=1000):\n        self.jsonl_file = jsonl_file\n        self.chunk_size = chunk_size\n        self.data = &#091;]\n        self._load_chunk(0)\n\n    def _load_chunk(self, chunk_index):\n        start_line = chunk_index * self.chunk_size\n        end_line = start_line + self.chunk_size\n        self.data = &#091;]\n        with open(self.jsonl_file, 'r') as f:\n            for i, line in enumerate(f):\n                if start_line &lt;= i &lt; end_line:\n                    self.data.append(json.loads(line))\n                if i &gt;= end_line:\n                    break\n\n    def __len__(self):\n        with open(self.jsonl_file, 'r') as f:\n            return sum(1 for _ in f)\n\n    def __getitem__(self, idx):\n        chunk_index = idx \/\/ self.chunk_size\n        self._load_chunk(chunk_index)\n        local_idx = idx % self.chunk_size\n        return self.data&#091;local_idx]\n\n# \u521b\u5efa Dataset \u548c DataLoader\ndataset = LargeJSONLDataset('data.jsonl')\ndataloader = DataLoader(dataset, batch_size=32, shuffle=True)\n<\/code><\/pre>\n\n\n\n<p class=\"has-light-pink-background-color has-background\">\u5b9e\u73b0\u7684\u903b\u8f91\uff1a\u91c7\u6837\u5668sampler \u83b7\u53d6 index =  
len(self.dataset),\u7136\u540e\u8fdb\u884cindex\u968f\u673a\u62bd\u6837\uff0c\u5c06\u62bd\u5230\u7684id\u9001\u7ed9dataloader\u52a0\u8f7d\u5668\uff0cdataloader\u6839\u636e\u8fd9\u4e9bid\uff0c\u53bbdataset\u7c7b\u91cc\u9762\u6267\u884cgetitem\u3002  dataset \u4e0d\u518d\u9700\u8981\u52a0\u8f7d\u6240\u6709\u7684jsonl\u6587\u4ef6\uff0c\u53ea\u9700\u8981\u6839\u636eid\/\/self.chunk_size\u5224\u65ad\u6570\u636e\u5728\u7b2c\u51e0\u4e2achunk\uff0c\u7136\u540e\u5bf9\u5e94\u9700\u8981\u52a0\u8f7d\u76ee\u6807chunk\u7684\u6570\u636e\u5373\u53ef\uff0c\u7136\u540e\u518did%  self.chunk_size \u5f97\u5230\u5728\u8be5chunk\u7684\u771f\u5b9eid\uff0c\u8bfb\u53d6\u3002\u8fd9\u6837\u505a\u7f3a\u70b9\u662f\u6bcf\u6b21\u90fd\u9700\u8981\u91cd\u65b0load jsonl\u6587\u4ef6\uff0c\u52a0\u8f7d\u65f6\u95f4\u53d8\u6162\u3002<\/p>\n\n\n\n<h3>2. <strong>\u4f7f\u7528\u5185\u5b58\u6620\u5c04<\/strong><\/h3>\n\n\n\n<p>\u5185\u5b58\u6620\u5c04\u53ef\u4ee5\u5e2e\u52a9\u5c06\u5927\u6587\u4ef6\u6620\u5c04\u5230\u5185\u5b58\u4e2d\u800c\u4e0d\u662f\u5b8c\u5168\u52a0\u8f7d\u3002<code>jsonl<\/code>\u683c\u5f0f\u901a\u5e38\u4e0d\u652f\u6301\u76f4\u63a5\u5185\u5b58\u6620\u5c04\uff0c\u4f46\u53ef\u4ee5\u4f7f\u7528\u5206\u5757\u5904\u7406\u4e0e\u5185\u5b58\u6620\u5c04\u7ed3\u5408\u7684\u65b9\u6cd5\u3002<\/p>\n\n\n\n<p class=\"has-light-pink-background-color 
has-background\"><strong>\u5185\u5b58\u6620\u5c04\u662f\u4e00\u79cd\u5c06\u78c1\u76d8\u4e0a\u7684\u6587\u4ef6\u6620\u5c04\u5230\u5185\u5b58\u4e2d\u7684\u65b9\u6cd5\u3002\u901a\u8fc7\u4f7f\u7528\u5185\u5b58\u6620\u5c04\uff0c\u6211\u4eec\u53ef\u4ee5\u5728\u4e0d\u5c06\u6574\u4e2a\u6587\u4ef6\u52a0\u8f7d\u5230\u5185\u5b58\u4e2d\u7684\u60c5\u51b5\u4e0b\u8bbf\u95ee\u6587\u4ef6\u7684\u5185\u5bb9\u3002\u8fd9\u5bf9\u4e8e\u5904\u7406\u5927\u578b\u6570\u636e\u96c6\u975e\u5e38\u6709\u7528\uff0c\u56e0\u4e3a\u5b83\u53ef\u4ee5\u8282\u7701\u5185\u5b58\u7a7a\u95f4\uff0c\u5e76\u4e14\u53ef\u4ee5\u5feb\u901f\u8bbf\u95ee\u6587\u4ef6\u7684\u4efb\u610f\u90e8\u5206\u3002<\/strong><\/p>\n\n\n\n<p>\u5185\u5b58\u6620\u5c04\uff1a<strong>\u5c06\u4e00\u4e2a\u6587\u4ef6\u6216\u8005\u5176\u5b83\u5bf9\u8c61\u6620\u5c04\u5230\u8fdb\u7a0b\u7684\u5730\u5740\u7a7a\u95f4\uff0c\u5b9e\u73b0\u6587\u4ef6\u78c1\u76d8\u5730\u5740\u548c\u8fdb\u7a0b\u865a\u62df\u5730\u5740\u7a7a\u95f4\u4e2d\u4e00\u6bb5\u865a\u62df\u5730\u5740\u7684\u4e00\u4e00\u5bf9\u6620\u5173\u7cfb<\/strong>\u3002\u5b9e\u73b0\u8fd9\u6837\u7684\u6620\u5c04\u5173\u7cfb\u540e\uff0c\u8fdb\u7a0b\u5c31\u53ef\u4ee5\u91c7\u7528\u6307\u9488\u7684\u65b9\u5f0f\u8bfb\u5199\u64cd\u4f5c\u8fd9\u4e00\u6bb5\u5185\u5b58\uff0c\u800c\u7cfb\u7edf\u4f1a\u81ea\u52a8\u56de\u5199\u810f\u9875\u9762\u5230\u5bf9\u5e94\u7684\u6587\u4ef6\u78c1\u76d8\u4e0a\uff0c\u5373\u5b8c\u6210\u4e86\u5bf9\u6587\u4ef6\u7684\u64cd\u4f5c\u800c\u4e0d\u5fc5\u518d\u8c03\u7528 read\u3001write 
\u7b49\u7cfb\u7edf\u8c03\u7528\u51fd\u6570\u3002\u76f8\u53cd\uff0c\u5185\u6838\u7a7a\u95f4\u5bf9\u8fd9\u6bb5\u533a\u57df\u7684\u4fee\u6539\u4e5f\u76f4\u63a5\u53cd\u6620\u7528\u6237\u7a7a\u95f4\uff0c\u4ece\u800c\u53ef\u4ee5\u5b9e\u73b0\u4e0d\u540c\u8fdb\u7a0b\u95f4\u7684\u6587\u4ef6\u5171\u4eab\u3002<\/p>\n\n\n\n<p>\u4f7f\u7528\u5185\u5b58\u6620\u5c04\u6709\u4ee5\u4e0b\u51e0\u4e2a\u4f18\u70b9\uff1a<\/p>\n\n\n\n<ol><li>\u8282\u7701\u5185\u5b58\u7a7a\u95f4\uff1a\u901a\u8fc7\u5185\u5b58\u6620\u5c04\uff0c\u6211\u4eec\u53ef\u4ee5\u5728\u4e0d\u5c06\u6574\u4e2a\u6587\u4ef6\u52a0\u8f7d\u5230\u5185\u5b58\u4e2d\u7684\u60c5\u51b5\u4e0b\u8bbf\u95ee\u6587\u4ef6\u7684\u5185\u5bb9\u3002\u8fd9\u5bf9\u4e8e\u5904\u7406\u5927\u578b\u6570\u636e\u96c6\u975e\u5e38\u6709\u7528\uff0c\u56e0\u4e3a\u5b83\u53ef\u4ee5\u8282\u7701\u5927\u91cf\u7684\u5185\u5b58\u7a7a\u95f4\u3002<\/li><li>\u5feb\u901f\u8bbf\u95ee\u6587\u4ef6\u7684\u4efb\u610f\u90e8\u5206\uff1a\u7531\u4e8e\u5185\u5b58\u6620\u5c04\u5c06\u6587\u4ef6\u6620\u5c04\u5230\u5185\u5b58\u4e2d\uff0c\u6211\u4eec\u53ef\u4ee5\u5feb\u901f\u8bbf\u95ee\u6587\u4ef6\u7684\u4efb\u610f\u90e8\u5206\uff0c\u800c\u4e0d\u9700\u8981\u8bfb\u53d6\u6574\u4e2a\u6587\u4ef6\u3002\u8fd9\u5bf9\u4e8e\u968f\u673a\u8bbf\u95ee\u5927\u578b\u6587\u4ef6\u975e\u5e38\u6709\u7528\u3002<\/li><li>\u652f\u6301\u5e76\u53d1\u8bbf\u95ee\uff1a\u591a\u4e2a\u8fdb\u7a0b\u53ef\u4ee5\u540c\u65f6\u8bbf\u95ee\u5185\u5b58\u6620\u5c04\u6587\u4ef6\uff0c\u800c\u4e0d\u4f1a\u53d1\u751f\u51b2\u7a81\u3002\u8fd9\u4f7f\u5f97\u5185\u5b58\u6620\u5c04\u975e\u5e38\u9002\u5408\u591a\u8fdb\u7a0b\u7684\u6570\u636e\u5904\u7406\u4efb\u52a1\u3002<\/li><\/ol>\n\n\n\n<figure class=\"wp-block-image size-full\"><img loading=\"lazy\" width=\"726\" height=\"398\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2024\/08\/36f35bf612f82cfa97704322db640250.webp\" alt=\"\" class=\"wp-image-16620\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2024\/08\/36f35bf612f82cfa97704322db640250.webp 726w, 
http:\/\/139.9.1.231\/wp-content\/uploads\/2024\/08\/36f35bf612f82cfa97704322db640250-300x164.webp 300w\" sizes=\"(max-width: 726px) 100vw, 726px\" \/><\/figure>\n\n\n\n<p><a href=\"https:\/\/github.com\/DACUS1995\/pytorch-mmap-dataset\">https:\/\/github.com\/DACUS1995\/pytorch-mmap-dataset<\/a><\/p>\n\n\n\n<h3>3. <strong>\u4f18\u5316\u6570\u636e\u5b58\u50a8\u683c\u5f0f<\/strong><\/h3>\n\n\n\n<p>\u8003\u8651\u5c06\u6570\u636e\u5b58\u50a8\u4e3a\u5176\u4ed6\u683c\u5f0f\uff0c\u5982HDF5\u6216Parquet\uff0c\u8fd9\u4e9b\u683c\u5f0f\u652f\u6301\u66f4\u9ad8\u6548\u7684\u5206\u5757\u8bfb\u5199\u548c\u538b\u7f29\u3002\u4f8b\u5982\uff0c\u53ef\u4ee5\u4f7f\u7528<code>pandas<\/code>\u5c06JSONL\u6587\u4ef6\u8f6c\u6362\u4e3aParquet\u683c\u5f0f\uff0c\u7136\u540e\u4f7f\u7528<code>pandas<\/code>\u8bfb\u53d6\u5b83\u4eec\u3002<\/p>\n\n\n\n<h3>4. <strong>\u4f7f\u7528\u6570\u636e\u6d41\u5904\u7406<\/strong><\/h3>\n\n\n\n<p>\u4f7f\u7528\u751f\u6210\u5668\u9010\u884c\u8bfb\u53d6\u6570\u636e\uff0c\u800c\u4e0d\u662f\u5c06\u6574\u4e2a\u6587\u4ef6\u52a0\u8f7d\u5230\u5185\u5b58\u4e2d\uff1a<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>def data_generator(file_path):\n    with open(file_path, 'r') as f:\n        for line in f:\n            yield json.loads(line)\n\n# \u5728 DataLoader \u4e2d\u4f7f\u7528\u751f\u6210\u5668\ndef collate_fn(batch):\n    # \u81ea\u5b9a\u4e49\u4f60\u7684\u6279\u5904\u7406\u64cd\u4f5c\n    return batch\n\ndataset = data_generator('data.jsonl')\ndataloader = DataLoader(dataset, batch_size=32, collate_fn=collate_fn)\n<\/code><\/pre>\n\n\n\n<h3>5. <strong>\u591a\u8fdb\u7a0b\u6570\u636e\u52a0\u8f7d<\/strong><\/h3>\n\n\n\n<p>\u4f7f\u7528<code>torch.utils.data.DataLoader<\/code>\u7684<code>num_workers<\/code>\u53c2\u6570\u6765\u5e76\u884c\u52a0\u8f7d\u6570\u636e\uff1a<\/p>\n\n\n\n<p>dataloader = DataLoader(dataset, batch_size=32, shuffle=True, num_workers=4)<\/p>\n\n\n\n<h3>6. 
<strong>\u6570\u636e\u9884\u5904\u7406<\/strong><\/h3>\n\n\n\n<p>\u5728\u6570\u636e\u52a0\u8f7d\u4e4b\u524d\u8fdb\u884c\u9884\u5904\u7406\uff0c\u5c06\u6570\u636e\u5904\u7406\u6210\u66f4\u7d27\u51d1\u7684\u683c\u5f0f\u6216\u8005\u5c06\u5176\u5212\u5206\u4e3a\u591a\u4e2a\u8f83\u5c0f\u7684\u6587\u4ef6\u8fdb\u884c\u5206\u6bb5\u52a0\u8f7d\u3002\u8fd9\u6837\u53ef\u4ee5\u51cf\u5c11\u6bcf\u6b21\u52a0\u8f7d\u7684\u6570\u636e\u91cf\u3002<\/p>\n\n\n\n<h3> 7. <strong>\u4f7f\u7528\u5206\u5757\u52a0\u8f7d<\/strong>\u3010\u6cd52\u3011 <\/h3>\n\n\n\n<p class=\"has-light-pink-background-color has-background\">      \u65b9\u6cd51 \u6bcf\u6b21\u8bfb\u53d6\u5355\u4e2a\u6570\u636e\uff0c\u90fd\u9700\u8981\u91cd\u65b0\u8bfb\u53d6\u4e00\u904djsonl\u6587\u4ef6\uff0c\u5927\u5927\u589e\u52a0\u4e86\u6570\u636e\u52a0\u8f7d\u7684\u65f6\u95f4\uff0c\u4e3a\u4e86\u5c3d\u91cf\u4e0d\u5f71\u54cd\u6570\u636e\u52a0\u8f7d\u65f6\u95f4\uff0c\u6211\u4eec\u8003\u8651\u727a\u7272\u4e00\u90e8\u5206\u968f\u673a\u6027\u6765\u63d0\u9ad8\u901f\u5ea6\u3002<\/p>\n\n\n\n<p>     \u5177\u4f53\u65b9\u6cd5\u4e3a\uff1ajsonl\u6570\u636e\u88ab\u5206\u6210N\u4efd\uff0c\u5728\u8bad\u7ec31\u8f6e\u4e2d\uff0c\u6570\u636edataloader\u5148\u52a0\u8f7d\u7b2c\u4e00\u4efd\u7684jsonl\u6570\u636e\uff0c\u7136\u540epart1\u6570\u636e\u52a0\u8f7d\u8bad\u7ec3\u7ed3\u675f\u540e\uff0c\u7ee7\u7eed\u52a0\u8f7dpart2\u7684jsonl\u6570\u636e&#8230;..\u76f4\u5230\u6240\u6709\u7684jsonl\u6570\u636e\u52a0\u8f7d\u5b8c\u6210\uff0c\u8bad\u7ec31\u8f6e\u7ed3\u675f\u3002\u8fd9\u6837\u505a\u7684\u597d\u5904\u662f\u6bcf\u6b21batch\u4e0d\u9700\u8981\u91cd\u65b0\u8bfb\u53d6jsonl\uff0c\u4f46\u7f3a\u70b9\u5c31\u662f\u4e0d\u540cpart\u7684jsonl\u4e4b\u95f4\u6570\u636e\u4e0d\u4e92\u901a\uff0c\u6570\u636e\u7684\u968f\u673a\u6027\u964d\u4f4e\uff0c\u5177\u4f53\u4ee3\u7801\u5b9e\u73b0\u53c2\u8003\uff1a<a 
href=\"https:\/\/github.com\/modelscope\/FunASR\/blob\/main\/funasr\/datasets\/audio_datasets\/index_ds.py\">FunASR<\/a><\/p>\n\n\n\n<p>1\uff1a\u5728\u8bad\u7ec31\u4e2aepoch\u65f6\u5019\uff1a\u4f20\u9012data_split_num\u3010\u6570\u636e\u5206\u6210\u51e0\u4efd\u3011 data_split_i \u3010\u5f53\u524d\u7b2c\u51e0\u4efd\u3011<\/p>\n\n\n\n<figure class=\"wp-block-image size-large\"><img loading=\"lazy\" width=\"1024\" height=\"453\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2024\/08\/image-3-1024x453.png\" alt=\"\" class=\"wp-image-16648\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2024\/08\/image-3-1024x453.png 1024w, http:\/\/139.9.1.231\/wp-content\/uploads\/2024\/08\/image-3-300x133.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2024\/08\/image-3-768x340.png 768w, http:\/\/139.9.1.231\/wp-content\/uploads\/2024\/08\/image-3.png 1087w\" sizes=\"(max-width: 1024px) 100vw, 1024px\" \/><\/figure>\n\n\n\n<p>2\u3001dataloader\u7684 build_iter\u4ee3\u7801\u5b9e\u73b0\uff1a\u672c\u8d28\u4e0a\u5c31\u662f \u91cd\u65b0\u6267\u884c <strong>torch.utils.data.Dataset\u3010\u53ef\u4ee5\u81ea\u5b9a\u4e49\u3011\u3001torch.utils.data.DataLoader\u3001\u4ee5\u53catorch.utils.data.distributed.DistributedSampler\u3010\u53ef\u4ee5\u81ea\u5df1\u5b9a\u4e49\u3011<\/strong> \uff0c\u9700\u8981\u5411 <strong>Dataset<\/strong> \u4f20\u9012 data_split_i  \u53c2\u6570\uff1b<\/p>\n\n\n\n<pre class=\"wp-block-preformatted\">    def build_iter(self, epoch=0, data_split_i=0, start_step=0, **kwargs):\n\n        # reload dataset slice\n        if self.data_split_num &gt; 1:\n            del self.dataset_tr\n           <strong> self.dataset_tr = self.dataset_class(\n                self.kwargs.get(\"train_data_set_list\"),\n                frontend=self.frontend,\n                tokenizer=self.tokenizer,\n                is_training=True,\n                **self.kwargs.get(\"dataset_conf\"),\n                data_split_i=data_split_i,\n            )<\/strong>\n\n        # dataloader\n 
       batch_sampler = self.kwargs[\"dataset_conf\"].get(\"batch_sampler\", \"BatchSampler\")\n        batch_sampler_val = None\n        if batch_sampler is not None:\n            batch_sampler_class = tables.batch_sampler_classes.get(batch_sampler)\n            batch_sampler = batch_sampler_class(\n                self.dataset_tr, start_step=start_step, **self.kwargs.get(\"dataset_conf\")\n            )\n            batch_sampler_val = batch_sampler_class(\n                self.dataset_val, is_training=False, **self.kwargs.get(\"dataset_conf\")\n            )\n\n        batch_sampler[\"batch_sampler\"].set_epoch(epoch)\n        batch_sampler_val[\"batch_sampler\"].set_epoch(epoch)\n        dataloader_tr = torch.utils.data.DataLoader(\n            self.dataset_tr, collate_fn=self.dataset_tr.collator, **batch_sampler\n        )\n        dataloader_val = torch.utils.data.DataLoader(\n            self.dataset_val, collate_fn=self.dataset_val.collator, **batch_sampler_val\n        )\n\n        return dataloader_tr, dataloader_val<\/pre>\n\n\n\n<p>3\u3001 <strong>Dataset<\/strong>  \u7684\u5177\u4f53\u5b9e\u73b0\uff1a<\/p>\n\n\n\n<p>\u53ef\u4ee5\u770b\u51fa\uff0cAudioDataset\u91cc\u9762\u5b9e\u9645\u4e0a\u5229\u7528\u7684index_ds\u6765\u5177\u4f53\u8bfb\u53d6jsonl\u6587\u4ef6\u5185\u5bb9\u7684\u3002<\/p>\n\n\n\n<figure class=\"wp-block-image size-full\"><img loading=\"lazy\" width=\"862\" height=\"546\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2024\/08\/image-4.png\" alt=\"\" class=\"wp-image-16657\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2024\/08\/image-4.png 862w, http:\/\/139.9.1.231\/wp-content\/uploads\/2024\/08\/image-4-300x190.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2024\/08\/image-4-768x486.png 768w\" sizes=\"(max-width: 862px) 100vw, 862px\" \/><\/figure>\n\n\n\n<figure class=\"wp-block-image size-full\"><img loading=\"lazy\" width=\"595\" height=\"277\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2024\/08\/image-5.png\" 
alt=\"\" class=\"wp-image-16658\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2024\/08\/image-5.png 595w, http:\/\/139.9.1.231\/wp-content\/uploads\/2024\/08\/image-5-300x140.png 300w\" sizes=\"(max-width: 595px) 100vw, 595px\" \/><\/figure>\n\n\n\n<p>4\u3001index_ds\u7684\u5b9e\u73b0\uff1a\u53ea\u8fd4\u56de\u90e8\u5206jsonl\u6570\u636e\uff0c\u867d\u7136\u51fd\u6570\u91cc\u9762\u52a0\u8f7d\u4e86\u6574\u4e2a\u6587\u4ef6\uff0c\u4f46\u51fd\u6570\u7ed3\u675f\u540efile_list_all\u5c31\u88ab\u91ca\u653e\u6389\u4e86\uff0c\u6700\u540e\u53ea\u6709file_list\u4e00\u76f4\u5728\u5360\u7528\u5185\u5b58\u3002<\/p>\n\n\n\n<figure class=\"wp-block-image size-full\"><img loading=\"lazy\" width=\"795\" height=\"216\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2024\/08\/image-6.png\" alt=\"\" class=\"wp-image-16660\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2024\/08\/image-6.png 795w, http:\/\/139.9.1.231\/wp-content\/uploads\/2024\/08\/image-6-300x82.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2024\/08\/image-6-768x209.png 768w\" sizes=\"(max-width: 795px) 100vw, 795px\" \/><\/figure>\n\n\n\n<figure class=\"wp-block-image size-large\"><img loading=\"lazy\" width=\"1024\" height=\"259\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2024\/08\/image-7-1024x259.png\" alt=\"\" class=\"wp-image-16661\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2024\/08\/image-7-1024x259.png 1024w, http:\/\/139.9.1.231\/wp-content\/uploads\/2024\/08\/image-7-300x76.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2024\/08\/image-7-768x194.png 768w, http:\/\/139.9.1.231\/wp-content\/uploads\/2024\/08\/image-7.png 1150w\" sizes=\"(max-width: 1024px) 100vw, 1024px\" \/><\/figure>\n\n\n\n<h3>8\u3001pytorch  <strong>pin_memory<\/strong>  \u8bbe\u7f6e\u4e3aFalse\u3010\u727a\u7272\u65f6\u95f4\u6362\u7a7a\u95f4\u3011<\/h3>\n\n\n\n<p class=\"has-light-pink-background-color 
has-background\"><strong>\u5728PyTorch\u4e2d\uff0c\u4f55\u65f6\u4f7f\u7528pin_memory\uff1f\u3010CPU\u5185\u5b58\u4e0d\u8db3\uff0c\u5efa\u8bae\u5173\u95ed\u8be5\u529f\u80fd\u3011 \u5f53\u8ba1\u7b97\u673a\u7684\u5185\u5b58\u5145\u8db3\u7684\u65f6\u5019\uff0c\u53ef\u4ee5\u8bbe\u7f6epin_memory=True\u3002\u5f53\u7cfb\u7edf\u5361\u4f4f\uff0c\u6216\u8005\u4ea4\u6362\u5185\u5b58\u4f7f\u7528\u8fc7\u591a\u7684\u65f6\u5019\uff0c\u8bbe\u7f6epin_memory=False\u3002<\/strong><\/p>\n\n\n\n<p><strong>pin_memory\u5c31\u662f\u9501\u9875\u5185\u5b58\uff0c\u521b\u5efaDataLoader\u65f6\uff0c\u8bbe\u7f6epin_memory=True\uff0c\u5219\u610f\u5473\u7740\u751f\u6210\u7684Tensor\u6570\u636e\u6700\u5f00\u59cb\u662f\u5c5e\u4e8e\u5185\u5b58\u4e2d\u7684\u9501\u9875\u5185\u5b58\uff0c\u8fd9\u6837\u5c06\u5185\u5b58\u7684Tensor\u8f6c\u79fb\u5230GPU\u7684\u663e\u5b58\u5c31\u4f1a\u66f4\u5feb\u4e00\u4e9b\u3002pin_memory=False\u8868\u793a\u5c06load\u8fdb\u6570\u636e\u653e\u81f3\u975e\u9501\u9875\u5185\u5b58\u533a\uff0c\u901f\u5ea6\u4f1a\u8f83\u6162\u3002<\/strong><\/p>\n\n\n\n<p>\u5f53\u8ba1\u7b97\u673a\u7684\u5185\u5b58\u5145\u8db3\u7684\u65f6\u5019\uff0c\u8bbe\u7f6epin_memory=True\u3002\u5f53\u7cfb\u7edf\u5361\u4f4f\uff0c\u6216\u8005\u4ea4\u6362\u5185\u5b58\u4f7f\u7528\u8fc7\u591a\u7684\u65f6\u5019\uff0c\u8bbe\u7f6epin_memory=False\u3002<\/p>\n\n\n\n<p><strong>\u4e3b\u673a\u4e2d\u7684\u5185\u5b58\uff0c\u6709\u4e24\u79cd\u5b58\u5728\u65b9\u5f0f: 
\u4e00\u662f\u9501\u9875\uff0c\u4e8c\u662f\u4e0d\u9501\u9875\uff0c<\/strong><\/p>\n\n\n\n<p><strong>\u9501\u9875\u5185\u5b58\u5b58\u653e\u7684\u5185\u5bb9\u5728\u4efb\u4f55\u60c5\u51b5\u4e0b\u90fd\u4e0d\u4f1a\u4e0e\u4e3b\u673a\u7684\u865a\u62df\u5185\u5b58\u8fdb\u884c\u4ea4\u6362\uff08\u6ce8\uff1a\u865a\u62df\u5185\u5b58\u5c31\u662f\u786c\u76d8\uff09\uff0c\u800c\u4e0d\u9501\u9875\u5185\u5b58\u5728\u4e3b\u673a\u5185\u5b58\u4e0d\u8db3\u65f6\uff0c\u6570\u636e\u4f1a\u5b58\u653e\u5728\u865a\u62df\u5185\u5b58\u4e2d\u3002\u663e\u5361\u4e2d\u7684\u663e\u5b58\u5168\u90e8\u662f\u9501\u9875\u5185\u5b58,\u5f53\u8ba1\u7b97\u673a\u7684\u5185\u5b58\u5145\u8db3\u7684\u65f6\u5019\uff0c\u53ef\u4ee5\u8bbe\u7f6epin_memory=True\u3002<\/strong><\/p>\n\n\n\n<p>\u5728\u4f7f\u7528PyTorch\u8fdb\u884c\u6570\u636e\u52a0\u8f7d\u65f6\uff0cpin_memory\u662f\u4e00\u4e2a\u53ef\u9009\u7684\u53c2\u6570\uff0c\u5b83\u901a\u5e38\u7528\u4e8e\u5c06\u6570\u636e\u5b58\u50a8\u5728\u4e3b\u673a\u5185\u5b58\uff08RAM\uff09\u4e2d\u7684\u56fa\u5b9a\u5185\u5b58\u9875\uff08pinned memory\uff09\u4e0a\uff0c\u4ee5\u4fbf\u66f4\u9ad8\u6548\u5730\u5c06\u6570\u636e\u4f20\u8f93\u5230GPU\u5185\u5b58\u3002<\/p>\n\n\n\n<p class=\"has-bright-blue-background-color 
has-background\"><strong>\u4e3b\u8981\u4f5c\u7528\u5982\u4e0b\uff1a<\/strong><\/p>\n\n\n\n<ol><li>\u63d0\u9ad8\u6570\u636e\u4f20\u8f93\u6548\u7387\uff1a\u5f53\u4f7f\u7528GPU\u8fdb\u884c\u8bad\u7ec3\u65f6\uff0c\u901a\u5e38\u9700\u8981\u5c06\u6570\u636e\u4ece\u4e3b\u673a\u5185\u5b58\u4f20\u8f93\u5230GPU\u5185\u5b58\u3002<strong>\u4f7f\u7528pin_memory\u53ef\u4ee5\u5c06\u6570\u636e\u5b58\u50a8\u5728\u56fa\u5b9a\u5185\u5b58\u9875\u4e2d\uff0c\u51cf\u5c11\u6570\u636e\u4f20\u8f93\u7684\u65f6\u95f4\u548c\u5f00\u9500\uff0c\u63d0\u9ad8\u6570\u636e\u4f20\u8f93\u7684\u6548\u7387\u3002<\/strong><\/li><li>\u51cf\u5c11\u6570\u636e\u4f20\u8f93\u5ef6\u8fdf\uff1a\u4e3b\u673a\u5185\u5b58\u548cGPU\u5185\u5b58\u4e4b\u95f4\u7684\u6570\u636e\u4f20\u8f93\u901a\u5e38\u6d89\u53ca\u5185\u5b58\u62f7\u8d1d\u64cd\u4f5c\uff0c\u800c\u5185\u5b58\u62f7\u8d1d\u662f\u4e00\u9879\u76f8\u5bf9\u8f83\u6162\u7684\u64cd\u4f5c\u3002<strong>pin_memory\u53ef\u4ee5\u5728\u6570\u636e\u52a0\u8f7d\u65f6\u5c06\u6570\u636e\u76f4\u63a5\u5b58\u653e\u5728\u56fa\u5b9a\u5185\u5b58\u9875\u4e2d\uff0c\u907f\u514d\u4e0d\u5fc5\u8981\u7684\u5185\u5b58\u62f7\u8d1d\u8fc7\u7a0b\uff0c\u4ece\u800c\u51cf\u5c11\u6570\u636e\u4f20\u8f93\u7684\u5ef6\u8fdf\u3002<\/strong><\/li><\/ol>\n\n\n\n<p><strong>\u9700\u8981\u6ce8\u610f\u7684\u662f\uff0c\u4f7f\u7528pin_memory\u4f1a\u5360\u7528\u989d\u5916\u7684\u4e3b\u673a\u5185\u5b58\uff0c\u5e76\u4e14\u53ea\u5728\u4f7f\u7528CUDA\u8bbe\u5907\u7684\u60c5\u51b5\u4e0b\u624d\u6709\u6548\u679c\u3002<\/strong><\/p>\n\n\n\n<figure class=\"wp-block-image\"><img src=\"https:\/\/picx.zhimg.com\/80\/v2-3c34b5f3aa1e0029fc1bc83179975eff_1440w.webp\" alt=\"\"\/><\/figure>\n\n\n\n<blockquote 
class=\"wp-block-quote\"><p>\u9501\u9875\u5185\u5b58\u548cGPU\u663e\u5b58\u4e4b\u95f4\u7684\u62f7\u8d1d\u901f\u5ea6\u5927\u7ea6\u662f6GB\/s<br>\u53ef\u5206\u9875\u5185\u5b58\u548cGPU\u663e\u5b58\u95f4\u7684\u62f7\u8d1d\u901f\u5ea6\u5927\u7ea6\u662f3GB\/s\u3002<br>GPU\u5185\u5b58\u95f4\u901f\u5ea6\u662f30GB\/s,CPU\u95f4\u5185\u5b58\u901f\u5ea6\u662f10GB\/s<\/p><\/blockquote>\n\n\n\n<p>\u901a\u5e38\u6211\u4eec\u7684\u4e3b\u673a\u5904\u7406\u5668\u662f\u652f\u6301\u865a\u62df\u5185\u5b58\u7cfb\u7edf\u7684\uff0c\u4e5f\u5c31\u662f\u4f7f\u7528\u786c\u76d8\u7a7a\u95f4\u6765\u4ee3\u66ff\u5185\u5b58\u3002\u5927\u591a\u6570\u7cfb\u7edf\u4e2d\u865a\u62df\u5185\u5b58\u7a7a\u95f4\u88ab\u5212\u5206\u6210\u8bb8\u591a\u9875\uff0c\u5b83\u4eec\u662f\u5bfb\u5740\u7684\u5355\u5143\uff0c\u9875\u7684\u5927\u5c0f\u81f3\u5c11\u662f4096\u4e2a\u5b57\u8282\u3002\u865a\u62df\u5bfb\u5740\u80fd\u4f7f\u4e00\u4e2a\u8fde\u7eed\u7684\u865a\u62df\u5730\u5740\u7a7a\u95f4\u6620\u5c04\u5230\u7269\u7406\u5185\u5b58\u5e76\u4e0d\u8fde\u7eed\u7684\u4e00\u4e9b\u9875\u3002<\/p>\n\n\n\n<p>\u5982\u679c\u67d0\u9875\u7684\u7269\u7406\u5185\u5b58\u88ab\u6807\u8bb0\u4e3a<strong>\u6362\u51fa<\/strong>\u72b6\u6001\uff0c\u5b83\u5c31\u53ef\u4ee5\u88ab\u66f4\u6362\u5230\u78c1\u76d8\u4e0a\uff0c\u4e5f\u5c31\u662f\u8bf4\u88ab\u8e22\u51fa\u5185\u5b58\u4e86\u3002\u5982\u679c\u4e0b\u6b21\u9700\u8981\u8be5\u9875\u4e86\uff0c\u5219\u91cd\u65b0\u52a0\u8f7d\u5230\u5185\u5b58\u91cc\u3002\u663e\u7136\u5982\u679c\u8fd9\u4e00\u9875\u5207\u6362\u7684\u975e\u5e38\u9891\u7e41\uff0c\u90a3\u4e48\u4f1a\u6d6a\u8d39\u4e0d\u5c11\u65f6\u95f4\u3002<\/p>\n\n\n\n<p>\u9501\u9875(pinned 
page)\u662f\u64cd\u4f5c\u7cfb\u7edf\u5e38\u7528\u7684\u64cd\u4f5c\uff0c\u5c31\u662f\u4e3a\u4e86\u4f7f\u786c\u4ef6\u5916\u8bbe\u76f4\u63a5\u8bbf\u95eeCPU\u5185\u5b58\uff0c\u4ece\u800c\u907f\u514d\u8fc7\u591a\u7684\u590d\u5236\u64cd\u4f5c\u3002\u88ab\u9501\u5b9a\u7684\u9875\u9762\u4f1a\u88ab\u64cd\u4f5c\u7cfb\u7edf\u6807\u8bb0\u4e3a<strong>\u4e0d\u53ef\u88ab\u6362\u51fa\u7684<\/strong>\uff0c\u6240\u4ee5\u8bbe\u5907\u9a71\u52a8\u7a0b\u5e8f\u7ed9\u8fd9\u4e9b\u5916\u8bbe\u7f16\u7a0b\u65f6\uff0c\u53ef\u4ee5\u4f7f\u7528\u9875\u9762\u7684\u7269\u7406\u5730\u5740\u76f4\u63a5\u8bbf\u95ee\u5185\u5b58\uff0cCPU\u4e5f\u53ef\u4ee5\u8bbf\u95ee\u4e0a\u8ff0\u9501\u9875\u5185\u5b58\uff0c\u4f46\u662f\u6b64\u5185\u5b58\u662f\u4e0d\u80fd\u79fb\u52a8\u6216\u6362\u9875\u5230\u78c1\u76d8\u4e0a\u7684\u3002\u53e6\u5916\uff0c\u5728GPU\u4e0a\u5206\u914d\u7684\u5185\u5b58\u9ed8\u8ba4\u90fd\u662f\u9501\u9875\u5185\u5b58\uff0c\u8fd9\u53ea\u662f\u56e0\u4e3aGPU\u4e0d\u652f\u6301\u5c06\u5185\u5b58\u4ea4\u6362\u5230\u78c1\u76d8\u4e0a\u3002<\/p>\n\n\n\n<p>Host\uff08\u4f8b\u5982CPU\uff09\u7684\u6570\u636e\u5206\u914d\u9ed8\u8ba4\u662f<strong>pageable(\u53ef\u5206\u9875\u7684)<\/strong>\uff0c\u4f46\u662fGPU\u662f\u6ca1\u6cd5\u76f4\u63a5\u8bfb\u53d6pageable\u5185\u5b58\u91cc\u7684\u6570\u636e\u7684\uff0c\u6240\u4ee5\u9700\u8981\u5148\u521b\u5efa\u4e00\u4e2a\u4e34\u65f6\u7684\u7f13\u51b2\u533a\uff08pinned memory\uff09\uff0c\u628a\u6570\u636e\u4ecepageable\u5185\u5b58\u62f7\u8d1d\u5230pinned\u5185\u5b58\u4e0a\uff0c\u7136\u540eGPU\u624d\u80fd\u4ecepinned\u5185\u5b58\u4e0a\u8bfb\u53d6\u6570\u636e\uff0c\u5982\u4e0a\u56fe\uff08\u5de6\uff09\u6240\u793a\u3002<\/p>\n\n\n\n<h3>9\u3001\u964d\u4f4enum_workers\u53c2\u6570\u503c<\/h3>\n\n\n\n<p><strong>\u4ece\u78c1\u76d8\u52a0\u8f7d\u6570\u636e\u5230 host \u7684page-locked\u5185\u5b58.
\u91c7\u7528\u591a\u4e2a worker \u8fdb\u7a0b\u5e76\u884c\u5730\u52a0\u8f7d\u6570\u636e\uff0c\u4f1a\u589e\u52a0\u5185\u5b58\u5360\u7528\uff0c\u56e0\u6b64\u4e3a\u4e86\u964d\u4f4e\u5185\u5b58\u5360\u7528\uff0c\u53ef\u4ee5\u8003\u8651num_workers\u4ece\u4f4e\u5230\u9ad8\u8bbe\u7f6e\uff1a2\u30014\u30018\u300116\uff0c\u76f4\u5230\u8bad\u7ec3\u901f\u5ea6\u8fbe\u5230\u6700\u4f18\u3002<\/strong><\/p>\n\n\n\n<figure class=\"wp-block-image size-full\"><img loading=\"lazy\" width=\"823\" height=\"102\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2024\/08\/image-10.png\" alt=\"\" class=\"wp-image-16766\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2024\/08\/image-10.png 823w, http:\/\/139.9.1.231\/wp-content\/uploads\/2024\/08\/image-10-300x37.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2024\/08\/image-10-768x95.png 768w\" sizes=\"(max-width: 823px) 100vw, 823px\" \/><\/figure>\n\n\n\n<p><strong>\u6bcf\u4e2a\u8fdb\u7a0b\u7684<code>num_workers<\/code><\/strong>: \u6bcf\u4e2a\u5206\u5e03\u5f0f\u8fdb\u7a0b\uff08\u5373\u6bcf\u4e2aGPU\uff09\u90fd\u6709\u81ea\u5df1\u7684\u6570\u636e\u52a0\u8f7d\u5b50\u8fdb\u7a0b\u3002\u8fd9\u610f\u5473\u7740\u603b\u7684<code>num_workers<\/code>\u4f1a\u662f\u6bcf\u4e2aGPU\u4e0a<code>num_workers<\/code>\u7684\u503c\u4e58\u4ee5GPU\u7684\u6570\u91cf\uff08\u5206\u5e03\u5f0f\u8fdb\u7a0b\u6570\uff09\u3002<\/p>\n\n\n\n<p><strong>\u4f8b\u5982\uff0c\u5982\u679c\u67094\u4e2aGPU\uff0c\u5e76\u4e14\u6bcf\u4e2aGPU\u7684<code>num_workers<\/code>\u8bbe\u7f6e\u4e3a4\uff0c\u90a3\u4e48\u603b\u7684\u5de5\u4f5c\u8fdb\u7a0b\u6570\u5c06\u662f4 * 4 = 16\u3002<\/strong><\/p>\n\n\n\n<p><strong>\u907f\u514d\u6570\u636e\u91cd\u53e0<\/strong>:
\u5728\u5206\u5e03\u5f0f\u8bad\u7ec3\u4e2d\uff0c\u9700\u8981\u786e\u4fdd\u6bcf\u4e2a\u8fdb\u7a0b\u5904\u7406\u7684\u6570\u636e\u5b50\u96c6\u662f\u4e0d\u540c\u7684\u3002\u4f7f\u7528<code>DistributedSampler<\/code>\u53ef\u4ee5\u786e\u4fdd\u6570\u636e\u5728\u5404\u4e2a\u8fdb\u7a0b\u95f4\u5747\u5300\u5206\u914d\uff0c\u4ece\u800c\u907f\u514d\u6570\u636e\u91cd\u590d\u548c\u4e22\u5931\u3002<\/p>\n\n\n\n<p><strong>\u540c\u6b65\u548c\u901a\u4fe1\u5f00\u9500<\/strong>: \u589e\u52a0<code>num_workers<\/code>\u7684\u6570\u91cf\u53ef\u80fd\u4f1a\u589e\u52a0\u8fdb\u7a0b\u95f4\u7684\u540c\u6b65\u548c\u901a\u4fe1\u5f00\u9500\uff0c\u7279\u522b\u662f\u5728\u591aGPU\u7684\u60c5\u51b5\u4e0b\u3002\u9700\u8981\u6839\u636e\u5177\u4f53\u7684\u786c\u4ef6\u914d\u7f6e\u548c\u6570\u636e\u96c6\u5927\u5c0f\u6765\u8c03\u6574<code>num_workers<\/code>\u3002<\/p>\n\n\n\n<p><strong>\u5185\u5b58\u548cCPU\u8d44\u6e90<\/strong>: \u6bcf\u589e\u52a0\u4e00\u4e2a\u5de5\u4f5c\u8fdb\u7a0b\uff0c\u90fd\u4f1a\u6d88\u8017\u989d\u5916\u7684CPU\u8d44\u6e90\u548c\u5185\u5b58\u3002\u786e\u4fdd\u4f60\u7684\u7cfb\u7edf\u6709\u8db3\u591f\u7684\u8d44\u6e90\u6765\u652f\u6301\u8bbe\u7f6e\u7684<code>num_workers<\/code>\u503c\u3002<\/p>\n\n\n\n<p><strong>\u5728\u7ed9Dataloader\u8bbe\u7f6eworker\u6570\u91cf\uff08num_worker\uff09\u65f6\uff0c\u5230\u5e95\u8bbe\u7f6e\u591a\u5c11\u5408\u9002\uff1f\u8fd9\u4e2aworker\u5230\u5e95\u600e\u4e48\u5de5\u4f5c\u7684\uff1f<\/strong><\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)\n<\/code><\/pre>\n\n\n\n<p><a><\/a><a><\/a>\u53c2\u6570\u8be6\u89e3\uff1a<\/p>\n\n\n\n<p class=\"has-light-pink-background-color has-background\">    
\u6bcf\u6b21dataloader\u52a0\u8f7d\u6570\u636e\u65f6\uff1adataloader\u4e00\u6b21\u6027\u521b\u5efanum_worker\u4e2aworker\uff0c\uff08\u4e5f\u53ef\u4ee5\u8bf4dataloader\u4e00\u6b21\u6027\u521b\u5efanum_worker\u4e2a\u5de5\u4f5c\u8fdb\u7a0b\uff0cworker\u4e5f\u662f\u666e\u901a\u7684\u5de5\u4f5c\u8fdb\u7a0b\uff09\uff0c\u5e76\u7528batch_sampler\u5c06\u6307\u5b9a\u7b2c\u51e0\u4e2abatch\u5206\u914d\u7ed9\u6307\u5b9aworker\uff0cworker\u5c06\u5b83\u8d1f\u8d23\u7684batch\u52a0\u8f7d\u8fdbRAM\u3002<\/p>\n\n\n\n<p>\u7136\u540e\uff0cdataloader\u4eceRAM\u4e2d\u627e\u672c\u8f6e\u8fed\u4ee3\u8981\u7528\u7684batch\uff0c\u5982\u679c\u627e\u5230\u4e86\uff0c\u5c31\u4f7f\u7528\u3002\u5982\u679c\u6ca1\u627e\u5230\uff0c\u5c31\u8981num_worker\u4e2aworker\u7ee7\u7eed\u52a0\u8f7dbatch\u5230\u5185\u5b58\uff0c\u76f4\u5230dataloader\u5728RAM\u4e2d\u627e\u5230\u76ee\u6807batch\u3002\u4e00\u822c\u60c5\u51b5\u4e0b\u90fd\u662f\u80fd\u627e\u5230\u7684\uff0c\u56e0\u4e3abatch_sampler\u6307\u5b9abatch\u65f6\u5f53\u7136\u4f18\u5148\u6307\u5b9a\u672c\u8f6e\u8981\u7528\u7684batch\u3002<\/p>\n\n\n\n<p class=\"has-light-pink-background-color 
has-background\">num_worker\u8bbe\u7f6e\u5f97\u5927\uff0c\u597d\u5904\u662f\u5bfbbatch\u901f\u5ea6\u5feb\uff0c\u56e0\u4e3a\u4e0b\u4e00\u8f6e\u8fed\u4ee3\u7684batch\u5f88\u53ef\u80fd\u5728\u4e0a\u4e00\u8f6e\/\u4e0a\u4e0a\u4e00\u8f6e\u2026\u8fed\u4ee3\u65f6\u5df2\u7ecf\u52a0\u8f7d\u597d\u4e86\u3002\u574f\u5904\u662f\u5185\u5b58\u5f00\u9500\u5927\uff0c\u4e5f\u52a0\u91cd\u4e86CPU\u8d1f\u62c5\uff08worker\u52a0\u8f7d\u6570\u636e\u5230RAM\u7684\u8fdb\u7a0b\u662fCPU\u590d\u5236\u7684\u561b\uff09\u3002num_workers\u7684\u7ecf\u9a8c\u8bbe\u7f6e\u503c\u662f\u81ea\u5df1\u7535\u8111\/\u670d\u52a1\u5668\u7684CPU\u6838\u5fc3\u6570\uff0c\u5982\u679cCPU\u5f88\u5f3a\u3001RAM\u4e5f\u5f88\u5145\u8db3\uff0c\u5c31\u53ef\u4ee5\u8bbe\u7f6e\u5f97\u66f4\u5927\u4e9b\u3002<\/p>\n\n\n\n<p>\u5982\u679cnum_worker\u8bbe\u4e3a0\uff0c\u610f\u5473\u7740\u6bcf\u4e00\u8f6e\u8fed\u4ee3\u65f6\uff0cdataloader\u4e0d\u518d\u6709\u81ea\u4e3b\u52a0\u8f7d\u6570\u636e\u5230RAM\u8fd9\u4e00\u6b65\u9aa4\uff08\u56e0\u4e3a\u6ca1\u6709worker\u4e86\uff09\uff0c\u800c\u662f\u5728RAM\u4e2d\u627ebatch\uff0c\u627e\u4e0d\u5230\u65f6\u518d\u52a0\u8f7d\u76f8\u5e94\u7684batch\u3002\u7f3a\u70b9\u5f53\u7136\u662f\u901f\u5ea6\u66f4\u6162\u3002<\/p>\n\n\n\n<ol><li><strong>\u6839\u636e\u786c\u4ef6\u914d\u7f6e\u8c03\u6574<\/strong>: \u5728\u591a\u6838 CPU \u73af\u5883\u4e0b\uff0c\u8bbe\u7f6e\u8f83\u9ad8\u7684 <code>num_workers<\/code>\uff08\u5982 4 \u5230 16\uff09\u53ef\u4ee5\u6709\u6548\u5229\u7528\u591a\u6838\u8d44\u6e90\uff0c\u63d0\u9ad8\u6570\u636e\u52a0\u8f7d\u901f\u5ea6\u3002\u5177\u4f53\u7684\u6700\u4f73\u503c\u9700\u8981\u6839\u636e\u7cfb\u7edf\u7684 CPU \u6838\u5fc3\u6570\u548c\u5185\u5b58\u60c5\u51b5\u6765\u8c03\u6574\u3002<\/li><li><strong>\u6570\u636e\u52a0\u8f7d\u74f6\u9888<\/strong>: \u5982\u679c\u4f60\u53d1\u73b0\u8bad\u7ec3\u65f6 GPU \u7ecf\u5e38\u5904\u4e8e\u7b49\u5f85\u6570\u636e\u7684\u72b6\u6001\uff0c\u8fd9\u53ef\u80fd\u662f\u56e0\u4e3a\u6570\u636e\u52a0\u8f7d\u6210\u4e3a\u4e86\u74f6\u9888\u3002\u589e\u52a0 
<code>num_workers<\/code> \u53ef\u4ee5\u5e2e\u52a9\u7f13\u89e3\u8fd9\u4e00\u95ee\u9898\u3002<\/li><li><strong>\u7cfb\u7edf\u8d1f\u8f7d<\/strong>: \u5728\u67d0\u4e9b\u60c5\u51b5\u4e0b\uff0c\u8bbe\u7f6e\u8fc7\u9ad8\u7684 <code>num_workers<\/code> \u53ef\u80fd\u4f1a\u5bfc\u81f4\u7cfb\u7edf\u8d1f\u8f7d\u8fc7\u9ad8\uff0c\u5f71\u54cd\u5176\u4ed6\u4efb\u52a1\u6216\u6574\u4f53\u7cfb\u7edf\u6027\u80fd\u3002\u56e0\u6b64\u9700\u8981\u627e\u5230\u4e00\u4e2a\u5e73\u8861\u70b9\u3002<\/li><li><strong>\u5b9e\u9a8c\u8c03\u6574<\/strong>: \u5b9e\u9645\u5e94\u7528\u4e2d\uff0c\u6700\u597d\u7684\u505a\u6cd5\u662f\u4ece\u8f83\u5c0f\u7684\u503c\u5f00\u59cb\uff08\u5982 2 \u6216 4\uff09\uff0c\u7136\u540e\u9010\u6b65\u589e\u52a0\uff0c\u89c2\u5bdf\u8bad\u7ec3\u8fc7\u7a0b\u4e2d\u7684\u6570\u636e\u52a0\u8f7d\u901f\u5ea6\u548c\u7cfb\u7edf\u8d44\u6e90\u4f7f\u7528\u60c5\u51b5\uff0c\u4ece\u800c\u786e\u5b9a\u6700\u4f73\u8bbe\u7f6e\u3002<\/li><\/ol>\n\n\n\n<figure class=\"wp-block-image size-large\"><img loading=\"lazy\" width=\"1024\" height=\"544\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2024\/08\/7d5b26c9be26e66c-1024x544.png\" alt=\"\" class=\"wp-image-16733\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2024\/08\/7d5b26c9be26e66c-1024x544.png 1024w, http:\/\/139.9.1.231\/wp-content\/uploads\/2024\/08\/7d5b26c9be26e66c-300x159.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2024\/08\/7d5b26c9be26e66c-768x408.png 768w, http:\/\/139.9.1.231\/wp-content\/uploads\/2024\/08\/7d5b26c9be26e66c.png 1269w\" sizes=\"(max-width: 1024px) 100vw, 1024px\" \/><\/figure>\n\n\n\n<p>DistributedDataParallel \u6d88\u9664\u4e86 DataParallel \u4e2d\u4e0a\u8ff0\u4e0d\u8db3. \u5176\u4e0d\u518d\u9700\u8981\u4e3b GPU\uff0c\u6bcf\u4e2a GPU \u5206\u522b\u8fdb\u884c\u5404\u81ea\u4efb\u52a1. 
\u6bcf\u4e2a GPU \u4e0a\u7684\u8bad\u7ec3\u662f\u5176\u72ec\u7acb\u8fdb\u7a0b\uff0c\u800c\u5728 DataParallel \u4e2d\u662f\u91c7\u7528\u591a\u7ebf\u7a0b(multi-thread) \u7684.<\/p>\n\n\n\n<p>DistributedDataParallel \u7684\u5de5\u4f5c\u8fc7\u7a0b\u5982\uff0c<\/p>\n\n\n\n<p>[1] &#8211; <strong>\u4ece\u78c1\u76d8\u52a0\u8f7d\u6570\u636e\u5230 host \u7684page-locked\u5185\u5b58. \u91c7\u7528\u591a\u4e2a worker \u8fdb\u7a0b\u5e76\u884c\u5730\u6570\u636e\u52a0\u8f7d\uff1b\u5176\u4e2d\uff0cdistributed data sampler \u786e\u4fdd\u4e86\u52a0\u8f7d\u7684\u6570\u636e\u5728\u8de8\u8fdb\u7a0b\u95f4\u662f\u4e0d\u91cd\u53e0\u7684.<\/strong><\/p>\n\n\n\n<p>[2] &#8211; \u5c06 mini-batch \u6570\u636e\u7531 page-locked \u5185\u5b58\u8f6c\u79fb\u5230 GPU. \u4e0d\u9700\u8981\u4efb\u4f55\u6570\u636e\u5e7f\u64ad. \u56e0\u4e3a\u6bcf\u4e2a GPU \u5206\u522b\u6709\u6a21\u578b\u526f\u672c\uff0c\u56e0\u6b64\u4e5f\u4e0d\u9700\u8981\u6a21\u578b\u5e7f\u64ad.<\/p>\n\n\n\n<p>[3] &#8211; \u5206\u522b\u5728\u5404 GPU \u72ec\u7acb\u8fdb\u884c\u524d\u5411\u8ba1\u7b97\u548c\u635f\u5931\u51fd\u6570\u8ba1\u7b97. \u56e0\u6b64\uff0c\u4e5f\u4e0d\u9700\u8981\u6536\u96c6\u5404 GPUs \u7684\u8f93\u51fa.<\/p>\n\n\n\n<p>[4] &#8211; \u540e\u5411\u68af\u5ea6\u8ba1\u7b97\uff0c\u68af\u5ea6\u662f\u8de8GPUs all-reduced\u7684. \u786e\u4fdd\u5728\u540e\u5411\u4f20\u64ad\u7ed3\u675f\u65f6\uff0c\u6bcf\u4e2a GPU \u6700\u7ec8\u5f97\u5230\u76f8\u540c\u7684\u5e73\u5747\u68af\u5ea6\u7684\u526f\u672c.<\/p>\n\n\n\n<p>[5] &#8211; \u66f4\u65b0\u6a21\u578b\u53c2\u6570. \u7531\u4e8e\u6bcf\u4e2a GPU \u662f\u7531\u76f8\u540c\u7684\u6a21\u578b\u526f\u672c\u5f00\u59cb\u7684\uff0c\u4e14\u68af\u5ea6\u662f all-reduced \u7684\uff0c\u56e0\u6b64\u6240\u6709 GPUs \u4e0a\u7684\u6743\u91cd\u66f4\u65b0\u662f\u76f8\u540c\u7684\uff0c\u65e0\u9700\u518d\u8fdb\u884c\u6a21\u578b\u540c\u6b65.<\/p>\n\n\n\n<p>\u4ee5\u4e0a\u5373\u5b8c\u6210\u4e86\u4e00\u6b21\u8fed\u4ee3. 
\u8fd9\u79cd\u8bbe\u8ba1\u786e\u4fdd\u4e86\u6a21\u578b\u53c2\u6570\u7684\u66f4\u65b0\u662f\u76f8\u540c\u7684\uff0c\u56e0\u6b64\u6d88\u9664\u4e86\u6bcf\u6b21\u5f00\u59cb\u65f6\u7684\u6a21\u578b\u540c\u6b65.<\/p>\n\n\n\n<h2><strong> <\/strong>B \u3001NCCL\u901a\u4fe1\u8d85\u65f6\u95ee\u9898<\/h2>\n\n\n\n<p>[PG 1 Rank 9] Timeout at NCCL work: 957, last enqueued NCCL work: 957, last completed NCCL work: 956.<br>[rank9]:[E ProcessGroupNCCL.cpp:577] [Rank 9] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted\/incomplete data.<br>[rank9]:[E ProcessGroupNCCL.cpp:583] [Rank 9] To avoid data inconsistency, we are taking the entire process down.<\/p>\n\n\n\n<p><strong>\u8fd9\u79cd\u62a5\u9519\u9700\u8981\u5177\u4f53\u60c5\u51b5\u5177\u4f53\u5206\u6790<\/strong><\/p>\n\n\n\n<h3>1\u3001\u5c1d\u8bd5\u589e\u52a0NCCL \u8d85\u65f6\u65f6\u95f4\/\u8bbe\u7f6e\u8fc7NCCL\u53d8\u91cf<\/h3>\n\n\n\n<p class=\"has-light-pink-background-color has-background\">\u5982\u4f55\u8bbe\u7f6e\uff1a<\/p>\n\n\n\n<p>1\u3001\u67e5\u770b\u53d8\u91cf\uff1a\u67e5\u770b\u73af\u5883\u53d8\u91cf <code>NCCL_IB_TIMEOUT<\/code> \u7684\u503c<\/p>\n\n\n\n<p>echo $NCCL_IB_TIMEOUT # \u5982\u679c\u73af\u5883\u53d8\u91cf\u5df2\u8bbe\u7f6e\uff0c\u8fd9\u4e2a\u547d\u4ee4\u5c06\u663e\u793a\u5176\u503c\uff1b\u5982\u679c\u6ca1\u6709\u8bbe\u7f6e\uff0c\u5219\u4e0d\u4f1a\u6709\u4efb\u4f55\u8f93\u51fa\u3002<\/p>\n\n\n\n<p><code>printenv<\/code> \u547d\u4ee4\u53ef\u4ee5\u663e\u793a\u6240\u6709\u73af\u5883\u53d8\u91cf\u7684\u503c\uff0c\u4e5f\u53ef\u4ee5\u67e5\u770b\u7279\u5b9a\u7684\u73af\u5883\u53d8\u91cf\uff1a<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>printenv NCCL_IB_TIMEOUT #\u5982\u679c\u73af\u5883\u53d8\u91cf\u672a\u8bbe\u7f6e\uff0c\u8be5\u547d\u4ee4\u4e0d\u4f1a\u8f93\u51fa\u4efb\u4f55\u5185\u5bb9\u3002<\/code><\/pre>\n\n\n\n<p>\u4e5f\u53ef\u4ee5\u4f7f\u7528 <code>env<\/code> 
\u547d\u4ee4\u6765\u5217\u51fa\u6240\u6709\u73af\u5883\u53d8\u91cf\uff0c\u5e76\u67e5\u627e <code>NCCL_IB_TIMEOUT<\/code>\uff1a<\/p>\n\n\n\n<pre class=\"wp-block-preformatted\"><code>env | grep NCCL_IB_TIMEOUT<\/code><\/pre>\n\n\n\n<p>NCCL\u76f8\u5173\u73af\u5883\u53d8\u91cf\u8bf4\u660e \u3010https:\/\/docs.nvidia.com\/deeplearning\/nccl\/user-guide\/docs\/usage.html\u3011<\/p>\n\n\n\n<ol><li><strong>NCCL_TIMEOUT:\u8bbe\u7f6e\u96c6\u5408\u64cd\u4f5c\u8d85\u65f6\u9608\u503c,\u5355\u4f4d\u6beb\u79d2\uff1b\u5982\u679c\u5e38\u89c1\u8d85\u65f6\u9519\u8bef,\u9002\u5f53\u589e\u5927\u8be5\u503c,\u4f46\u4e0d\u80fd\u592a\u5927<\/strong>\u3002<code>NCCL_TIMEOUT<\/code> \u73af\u5883\u53d8\u91cf\u7528\u4e8e\u8bbe\u7f6e NCCL \u96c6\u4f53\u901a\u4fe1\u64cd\u4f5c\u7684\u8d85\u65f6\u65f6\u95f4\u3002\u901a\u8fc7\u8c03\u6574\u8fd9\u4e2a\u503c\uff0c\u4f60\u53ef\u4ee5\u66f4\u597d\u5730\u5904\u7406\u7f51\u7edc\u5ef6\u8fdf\u548c\u4e0d\u7a33\u5b9a\u7684\u95ee\u9898\uff0c\u786e\u4fdd NCCL \u901a\u4fe1\u7684\u7a33\u5b9a\u6027\u548c\u53ef\u9760\u6027\u3002\u5982\u679c\u5728\u96c6\u4f53\u901a\u4fe1\u8fc7\u7a0b\u4e2d\u9047\u5230\u8d85\u65f6\u95ee\u9898\uff0c\u53ef\u4ee5\u5c1d\u8bd5\u8c03\u6574\u6b64\u73af\u5883\u53d8\u91cf\u4ee5\u89e3\u51b3\u95ee\u9898\u3002<\/li><\/ol>\n\n\n\n<p><strong>\u8bbe\u7f6e\u8d85\u65f6\u65f6\u95f4<\/strong>:<\/p>\n\n\n\n<ul><li><code>NCCL_TIMEOUT<\/code> \u7528\u4e8e\u5b9a\u4e49 NCCL \u96c6\u4f53\u901a\u4fe1\u64cd\u4f5c\u7684\u8d85\u65f6\u65f6\u95f4\u3002\u8d85\u65f6\u65f6\u95f4\u662f NCCL 
\u5728\u6267\u884c\u64cd\u4f5c\u65f6\u7b49\u5f85\u54cd\u5e94\u7684\u6700\u957f\u65f6\u95f4\uff0c\u8d85\u51fa\u6b64\u65f6\u95f4\u5c06\u89e6\u53d1\u8d85\u65f6\u9519\u8bef\u3002<\/li><\/ul>\n\n\n\n<p><strong>\u89e3\u51b3\u7f51\u7edc\u95ee\u9898<\/strong>:<\/p>\n\n\n\n<ul><li>\u5728\u9ad8\u6027\u80fd\u8ba1\u7b97\u548c\u5927\u89c4\u6a21\u5206\u5e03\u5f0f\u8bad\u7ec3\u4e2d\uff0c\u7f51\u7edc\u5ef6\u8fdf\u6216\u4e0d\u7a33\u5b9a\u53ef\u80fd\u5bfc\u81f4\u96c6\u4f53\u901a\u4fe1\u64cd\u4f5c\u8d85\u65f6\u3002\u8bbe\u7f6e\u5408\u9002\u7684 <code>NCCL_TIMEOUT<\/code> \u53ef\u4ee5\u5e2e\u52a9\u8c03\u8282\u5bb9\u9519\u8bbe\u7f6e\uff0c\u907f\u514d\u8bad\u7ec3\u8fc7\u7a0b\u4e2d\u56e0\u8d85\u65f6\u9519\u8bef\u800c\u4e2d\u65ad\u3002<\/li><\/ul>\n\n\n\n<p><strong>\u6027\u80fd\u8c03\u4f18<\/strong>:<\/p>\n\n\n\n<ul><li>\u6839\u636e\u4f60\u7684\u96c6\u7fa4\u914d\u7f6e\u548c\u7f51\u7edc\u72b6\u51b5\uff0c\u9002\u5f53\u8c03\u6574 <code>NCCL_TIMEOUT<\/code> \u53ef\u4ee5\u5e2e\u52a9\u4f18\u5316\u901a\u4fe1\u6027\u80fd\u548c\u7a33\u5b9a\u6027\u3002<\/li><\/ul>\n\n\n\n<ol><li>NCCL_ALGO:\u9009\u62e9\u96c6\u5408\u901a\u4fe1\u7b97\u6cd5,\u5982Ring, 
Tree\uff1b\u4e0d\u540c\u62d3\u6251\u9002\u5408\u4e0d\u540c\u7b97\u6cd5,\u6d4b\u8bd5\u9009\u66f4\u4f18\u7b97\u6cd5<\/li><li>NCCL_CHUNK_SIZE:\u5b9a\u4e49\u73af\u5f62\u4f20\u8f93\u7f13\u51b2\u533a\u5927\u5c0f\uff1b\u5408\u7406\u8bbe\u7f6e\u53ef\u63d0\u901f,\u4f46\u4e5f\u4f1a\u589e\u52a0\u5185\u5b58\u6d88\u8017<\/li><li>NCCL_DEBUG:\u6253\u5f00NCCL\u8c03\u8bd5\u65e5\u5fd7\uff1b\u51fa\u73b0\u95ee\u9898\u65f6\u6253\u5f00\u8c03\u8bd5,\u4f46\u4f1a\u964d\u4f4e\u901f\u5ea6,\u4e0d\u8981\u5728\u751f\u4ea7\u73af\u5883\u4f7f\u7528<\/li><li>NCCL_DEBUG_FILE\u8bbe\u7f6e\u4e00\u4e2a\u6587\u4ef6\u5730\u5740\uff0c\u53d8\u91cf\u7528\u4e8e\u5c06NCCL\u7684\u8c03\u8bd5\u65e5\u5fd7\u8f93\u51fa\u5230\u6587\u4ef6\u4e2d\u3002\u6709\u52a9\u4e8e\u8c03\u8bd5nccl\u3002<\/li><li>NCCL_P2P_LEVEL:\u8bbe\u7f6e\u70b9\u5bf9\u70b9\u901a\u4fe1\u4f18\u5316\u7ea7\u522b\uff1b\u589e\u52a0\u8be5\u503c\u53ef\u51cf\u5c11P2P\u6b21\u6570,\u63d0\u9ad8\u67d0\u4e9b\u64cd\u4f5c\u6548\u7387<\/li><li><strong>NCCL_P2P_DISABLE:\u7981\u7528\u70b9\u5bf9\u70b9\u901a\u4fe1,\u5f3a\u5236\u4f7f\u7528\u96c6\u5408\u901a\u4fe1\u3002<\/strong>\u5728\u67d0\u4e9b\u60c5\u51b5\u4e0b\uff0cP2P \u901a\u4fe1\u53ef\u80fd\u4f1a\u5bfc\u81f4\u6027\u80fd\u95ee\u9898\u6216\u51fa\u73b0\u9519\u8bef\u3002\u7981\u7528 P2P \u901a\u4fe1\u53ef\u4ee5\u5e2e\u52a9\u89e3\u51b3\u8fd9\u4e9b\u95ee\u9898\u3002\u5982\u679c\u4f60\u9047\u5230\u4e0e P2P \u901a\u4fe1\u76f8\u5173\u7684\u9519\u8bef\u6216\u4e0d\u7a33\u5b9a\u6027\uff0c\u7981\u7528 P2P \u53ef\u80fd\u6709\u52a9\u4e8e\u6062\u590d\u7cfb\u7edf\u7684\u7a33\u5b9a\u6027\u3002<\/li><li>NCCL_PXN_DISABLE\uff1a\u7981\u7528\u4f7f\u7528\u975e\u672c\u5730 NIC \u7684\u8282\u70b9\u95f4\u901a\u4fe1\uff0c\u4f7f\u7528 NVLink \u548c\u4e00\u4e2a\u4e2d\u95f4 
GPU\u3002\u5efa\u8bae\u8bbe\u7f6e\u62101\u3002\u5728PyTorch\u4e2d\u8fdb\u884c\u8de8\u8282\u70b9all-to-all\u901a\u4fe1\u65f6\uff0c\u5982\u679c\u8be5\u73af\u5883\u53d8\u91cf\u662f0\u4f1a\u51fa\u73b0\u5f02\u5e38\u3002<\/li><li>NCCL_SOCKET_IFNAME:\u9009\u62e9\u7f51\u7edc\u63a5\u53e3\u3002<\/li><li>NCCL_SOCKET_NTHREADS \u589e\u52a0\u5b83\u7684\u6570\u91cf\u53ef\u4ee5\u63d0\u9ad8socket\u4f20\u8f93\u7684\u6548\u7387\uff0c\u4f46\u662f\u4f1a\u589e\u52a0CPU\u7684\u8d1f\u62c5<\/li><li>NCCL_NET_GDR_LEVEL:\u8bbe\u7f6eGPUDirect RDMA\u7684\u4f7f\u7528\u7ea7\u522b\u3002<\/li><li>NCCL_MAX_NRINGS:\u5b9a\u4e49\u652f\u6301\u7684\u6700\u5927NCCL\u73af\u8def\u6570\u3002<\/li><li>NCCL_MIN_NRINGS:\u5b9a\u4e49\u6700\u5c0f\u73af\u8def\u6570\u3002<\/li><li>NCCL_BUFFSIZE:\u8bbe\u7f6escratch\u7a7a\u95f4\u5927\u5c0f\u3002<\/li><li>NCCL_BUFFSIZE \u7f13\u5b58\u6570\u636e\u91cf\uff0c\u7f13\u5b58\u8d8a\u5927\u4e00\u6b21ring\u4f20\u8f93\u7684\u6570\u636e\u5c31\u8d8a\u5927\uff0c\u81ea\u7136\u5bf9\u5e26\u5bbd\u7684\u538b\u529b\u4e5f\u8d8a\u5927\uff0c\u4f46\u662f\u76f8\u5e94\u7684\u603b\u5ef6\u8fdf\u6b21\u6570\u4f1a\u5c11\u3002\u9ed8\u8ba4\u503c\u662f4M\uff084194304\uff09\uff0c\u6ce8\u610f\u8bbe\u7f6e\u7684\u65f6\u5019\u4f7f\u7528bytes\uff08\u5b57\u8282\u5927\u5c0f\uff09<\/li><li>NCCL_NTHREADS:\u8bbe\u7f6eNCCL\u5185\u90e8\u4f7f\u7528\u7684\u7ebf\u7a0b\u6570\u3002<\/li><li>NCCL_VERSION:\u663e\u793aNCCL\u7248\u672c\u4fe1\u606f\u3002<\/li><li>NCCL_MAX\/MIN_NCHANNELS \u6700\u5c0f\u548c\u6700\u5927\u7684rings\uff0crings\u8d8a\u591a\u5bf9GPU\u7684\u663e\u5b58\u3001\u5e26\u5bbd\u7684\u538b\u529b\u90fd\u8d8a\u5927\uff0c\u4e5f\u4f1a\u5f71\u54cd\u8ba1\u7b97\u6027\u80fd<\/li><li>NCCL_CHECKS_DISABLE \u5728\u6bcf\u6b21\u96c6\u5408\u901a\u4fe1\u8fdb\u884c\u524d\u5bf9\u53c2\u6570\u68c0\u9a8c\u6821\u5bf9\uff0c\u8fd9\u4f1a\u589e\u52a0\u5ef6\u8fdf\u65f6\u95f4\uff0c\u5728\u751f\u4ea7\u73af\u5883\u4e2d\u53ef\u4ee5\u8bbe\u4e3a1.\u9ed8\u8ba4\u662f0<\/li><li>NCCL_CHECK_POINTERS
\u5728\u6bcf\u6b21\u96c6\u5408\u901a\u4fe1\u8fdb\u884c\u524d\u5bf9CUDA\u5185\u5b58 \u6307\u9488\u8fdb\u884c\u6821\u9a8c\uff0c\u8fd9\u4f1a\u589e\u52a0\u5ef6\u8fdf\u65f6\u95f4\uff0c\u5728\u751f\u4ea7\u73af\u5883\u4e2d\u5e94\u4fdd\u6301\u4e3a0\u3002\u9ed8\u8ba4\u662f0<\/li><li>NCCL_NET_GDR_LEVEL GDR\u89e6\u53d1\u7684\u6761\u4ef6\uff0c\u9ed8\u8ba4\u662f\u5f53GPU\u548cNIC\u6302\u8f7d\u5728\u540c\u4e00\u4e2aswitch\u4e0a\u9762\u65f6\u4f7f\u7528GDR<\/li><li>NCCL_IGNORE_CPU_AFFINITY \u5ffd\u7565CPU\u4e0e\u5e94\u7528\u7684\u4eb2\u548c\u6027\u4f7f\u7528GPU\u4e0enic\u7684\u4eb2\u548c\u6027\u4e3a\u4e3b<\/li><li><strong>NCCL_IB_DISABLE:\u7981\u7528InfiniBand\u4f20\u8f93\u3002<\/strong><\/li><\/ol>\n\n\n\n<p><strong>\u7981\u7528 InfiniBand<\/strong>: \u8bbe\u7f6e <code>NCCL_IB_DISABLE=1<\/code> \u4f1a\u7981\u7528 NCCL \u5728 InfiniBand \u8bbe\u5907\u4e0a\u7684\u4f7f\u7528\u3002\u8fd9\u610f\u5473\u7740 NCCL \u5c06\u4e0d\u4f1a\u5229\u7528 InfiniBand \u7f51\u7edc\u8fdb\u884c\u6570\u636e\u4f20\u8f93\uff0c\u800c\u662f\u56de\u9000\u5230\u5176\u4ed6\u7f51\u7edc\u63a5\u53e3\uff08\u4f8b\u5982\u4ee5\u592a\u7f51\u6216\u5176\u4ed6\u7f51\u7edc\u63a5\u53e3\uff09\u3002<\/p>\n\n\n\n<p><strong>\u8c03\u8bd5\u548c\u517c\u5bb9\u6027<\/strong>: \u7981\u7528 InfiniBand \u53ef\u80fd\u7528\u4e8e\u8c03\u8bd5\u76ee\u7684\uff0c\u6216\u5728\u7cfb\u7edf\u4e2d InfiniBand \u7f51\u7edc\u51fa\u73b0\u95ee\u9898\u65f6\u56de\u9000\u5230\u5176\u4ed6\u7f51\u7edc\u63a5\u53e3\u3002\u5982\u679c\u4f60\u9047\u5230\u4e0e InfiniBand \u76f8\u5173\u7684\u9519\u8bef\u6216\u517c\u5bb9\u6027\u95ee\u9898\uff0c\u7981\u7528 InfiniBand \u53ef\u80fd\u6709\u52a9\u4e8e\u89e3\u51b3\u8fd9\u4e9b\u95ee\u9898\u3002<\/p>\n\n\n\n<ol><li>NCCL_IB_HCA \u4ee3\u8868IB\u4f7f\u7528\u7684\u8bbe\u5907\uff1aMellanox mlx5\u7cfb\u5217\u7684HCA\u8bbe\u5907NCCL_IB_HCA=mlx5 \u4f1a\u9ed8\u8ba4\u8f6e\u8be2\u6240\u6709\u7684\u8bbe\u5907\u3002NCCL_IB_HCA=mlx5_0:1
\u6307\u5b9a\u5176\u4e2d\u4e00\u53f0\u8bbe\u5907\u3002<\/li><li><strong>NCCL_IB_TIMEOUT&nbsp;\u8be5\u53d8\u91cf\u7528\u4e8e\u63a7\u5236InfiniBand Verbs\u8d85\u65f6\u3002\u53d6\u503c\u8303\u56f41-22\u3002\u8d85\u65f6\u65f6\u95f4\u7684\u8ba1\u7b97\u516c\u5f0f\u4e3a4.096\u5fae\u79d2 * 2 ^ timeout\uff0c\u6b63\u786e\u7684\u503c\u53d6\u51b3\u4e8e\u7f51\u7edc\u7684\u5927\u5c0f\u3002\u589e\u52a0\u8be5\u503c\u53ef\u4ee5\u5728\u975e\u5e38\u5927\u7684\u7f51\u7edc\u4e0a\u63d0\u4f9b\u5e2e\u52a9\uff0c\u4f8b\u5982 NCCL\u5728\u8c03\u7528ibv_poll_cq\u65f6\u51fa\u73b0\u9519\u8bef12\u65f6\u3002\u5efa\u8bae\u5728\u5927\u6a21\u578b\u8bad\u7ec3\u4efb\u52a1\u4e2d\u8bbe\u7f6e\u6210\u6700\u5927\u503c22\uff0c\u53ef\u4ee5\u51cf\u5c11\u4e0d\u5c11nccl timeout\u5f02\u5e38\u3002\u8bbe\u7f6e\u8d85\u65f6\u65f6\u95f4<\/strong>: <code>NCCL_IB_TIMEOUT<\/code> \u7528\u4e8e\u63a7\u5236 InfiniBand \u7f51\u7edc\u64cd\u4f5c\u7684\u8d85\u65f6\u65f6\u95f4\u3002\u901a\u8fc7\u8c03\u6574\u8fd9\u4e2a\u503c\uff0c\u4f60\u53ef\u4ee5\u63a7\u5236 NCCL \u5728\u9047\u5230\u901a\u4fe1\u5ef6\u8fdf\u6216\u7f51\u7edc\u95ee\u9898\u65f6\u7684\u5bb9\u5fcd\u5ea6\u3002<strong>\u89e3\u51b3\u7f51\u7edc\u95ee\u9898<\/strong>: \u5728\u9ad8\u6027\u80fd\u8ba1\u7b97\u548c\u5927\u89c4\u6a21\u5206\u5e03\u5f0f\u8bad\u7ec3\u4e2d\uff0c\u7f51\u7edc\u5ef6\u8fdf\u6216\u4e0d\u7a33\u5b9a\u53ef\u80fd\u5bfc\u81f4\u8d85\u65f6\u9519\u8bef\u3002\u8c03\u6574 <code>NCCL_IB_TIMEOUT<\/code> \u53ef\u4ee5\u5e2e\u52a9\u4f60\u5728\u9047\u5230\u7f51\u7edc\u95ee\u9898\u65f6\u66f4\u597d\u5730\u8c03\u8282\u8d85\u65f6\u8bbe\u7f6e\uff0c\u907f\u514d\u8bad\u7ec3\u8fc7\u7a0b\u88ab\u4e2d\u65ad\u3002<\/li><\/ol>\n\n\n\n<ol><li>NCCL_IB_RETRY_CNT\u53d8\u91cf\u63a7\u5236 InfiniBand
\u7684\u91cd\u8bd5\u6b21\u6570\u3002\u5efa\u8bae\u5728\u5927\u6a21\u578b\u8bad\u7ec3\u4efb\u52a1\u4e2d\u8bbe\u7f6e\u621013\uff0c\u5c3d\u53ef\u80fd\u591a\u91cd\u8bd5\u3002<\/li><li>NCCL_DEBUG_FILE\u8bbe\u7f6e\u4e00\u4e2a\u6587\u4ef6\u5730\u5740\uff0c\u53d8\u91cf\u7528\u4e8e\u5c06NCCL\u7684\u8c03\u8bd5\u65e5\u5fd7\u8f93\u51fa\u5230\u6587\u4ef6\u4e2d\u3002\u6709\u52a9\u4e8e\u8c03\u8bd5nccl\u3002<\/li><li>NCCL_IB_PCI_RELAXED_ORDERING\u542f\u7528 IB Verbs \u4f20\u8f93\u7684Relaxed Ordering\u3002Relaxed Ordering\u53ef\u4ee5\u6781\u5927\u5730\u63d0\u9ad8\u865a\u62df\u5316\u73af\u5883\u4e0b InfiniBand \u7f51\u7edc\u7684\u6027\u80fd\u3002\u8bbe\u7f6e\u4e3a 2\uff0c\u5982\u679c\u53ef\u7528\uff0c\u81ea\u52a8\u4f7f\u7528Relaxed Ordering\u3002\u8bbe\u7f6e\u4e3a 1\uff0c\u5f3a\u5236\u4f7f\u7528Relaxed Ordering\uff0c\u5982\u679c\u4e0d\u53ef\u7528\u5219\u5931\u8d25\u3002\u8bbe\u7f6e\u4e3a 0\uff0c\u7981\u7528\u4f7f\u7528Relaxed Ordering\u3002\u9ed8\u8ba4\u503c\u4e3a 2\u3002\u5efa\u8bae\u503c\u4e3a1<\/li><\/ol>\n\n\n\n<h3>2\u3001\u589e\u52a0 dist.init_process_group  \u8d85\u65f6\u65f6\u95f4\uff0c\u8fd8\u8981\u5bf9\u5e94\u4fee\u6539NCCL\u53d8\u91cf\uff1a <strong>export<\/strong>  <strong>TORCH_NCCL_BLOCKING_WAIT&nbsp;<\/strong>\uff01\uff01<\/h3>\n\n\n\n<p class=\"has-light-gray-background-color has-background\"> dist.init_process_group(backend=kwargs.get(&quot;backend&quot;, &quot;nccl&quot;), init_method=&quot;env:\/\/&quot;,timeout=timedelta(seconds=7200)) # 7200s \u7b49\u5f852h<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>\n<strong>export<\/strong> <strong>TORCH_NCCL_BLOCKING_WAIT=1<\/strong>  # \u662f\u5426\u5835\u585e\u7b49\u5f85\u67d0\u8282\u70b9\u9519\u8bef\u8d85\u65f6 \u201c0\u201d \u4e0d\u5835\u585e\u7b49\u5f85  \u201c1\u201d \u5835\u585e\u7b49\u5f85\n<strong>echo $TORCH_NCCL_BLOCKING_WAIT\nprintenv TORCH_NCCL_BLOCKING_WAIT  # \u65b0\u7248\u672ctorch<\/strong>\n\n<strong>export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 #
\u662f\u5426\u542f\u7528\u5f02\u6b65\u9519\u8bef\u5904\u7406 \u201c0\u201d \u4e0d\u5904\u7406  \u201c1\u201d \u96c6\u5408\u901a\u4fe1\u51fa\u9519\u6216\u8d85\u65f6\u540e\u7ec8\u6b62\u8fdb\u7a0b\necho $TORCH_NCCL_ASYNC_ERROR_HANDLING\nprintenv TORCH_NCCL_ASYNC_ERROR_HANDLING # \u65b0\u7248\u672ctorch<\/strong>\n\n<strong>export NCCL_BLOCKING_WAIT=1\necho $NCCL_BLOCKING_WAIT\nprintenv NCCL_BLOCKING_WAIT      #\u65e7\u7248\u672ctorch\n\nexport NCCL_ASYNC_ERROR_HANDLING=1\necho $NCCL_ASYNC_ERROR_HANDLING\nprintenv NCCL_ASYNC_ERROR_HANDLING<\/strong>   <strong>#\u65e7\u7248\u672ctorch<\/strong><\/code><\/pre>\n\n\n\n<p>\u5728\u4f7f\u7528 <code>torch.distributed.init_process_group<\/code> \u521d\u59cb\u5316\u5206\u5e03\u5f0f\u8bad\u7ec3\u65f6\uff0c<code>timeout<\/code> \u53c2\u6570\u7528\u4e8e\u6307\u5b9a\u96c6\u7fa4\u4e2d\u8fdb\u7a0b\u4e4b\u95f4\u8fdb\u884c\u96c6\u4f53\u901a\u4fe1\u64cd\u4f5c\u65f6\u7684\u8d85\u65f6\u65f6\u95f4\u3002\u8fd9\u4e2a\u8d85\u65f6\u65f6\u95f4\u51b3\u5b9a\u4e86\u5206\u5e03\u5f0f\u8fdb\u7a0b\u5728\u7b49\u5f85\u5176\u4ed6\u8fdb\u7a0b\u54cd\u5e94\u65f6\u7684\u6700\u957f\u65f6\u95f4\u3002<\/p>\n\n\n\n<p>torch.distributed.init_process_group(<em>backend=None<\/em>,&nbsp;<em>init_method=None<\/em>,&nbsp;<em>timeout=None<\/em>,&nbsp;<em>world_size=-1<\/em>,&nbsp;<em>rank=-1<\/em>,&nbsp;<em>store=None<\/em>,&nbsp;<em>group_name=&#39;&#39;<\/em>,&nbsp;<em>pg_options=None<\/em>,&nbsp;<em>device_id=None<\/em>)<\/p>\n\n\n\n<p>\u8bf4\u660e\u6587\u6863\uff1a<a rel=\"noreferrer noopener\" href=\"https:\/\/pytorch.org\/docs\/stable\/distributed.html\" target=\"_blank\">https:\/\/pytorch.org\/docs\/stable\/distributed.html<\/a><\/p>\n\n\n\n<figure class=\"wp-block-image size-full\"><img loading=\"lazy\" width=\"874\" height=\"598\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2024\/08\/image-12.png\" alt=\"\" class=\"wp-image-16877\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2024\/08\/image-12.png 874w, http:\/\/139.9.1.231\/wp-content\/uploads\/2024\/08\/image-12-300x205.png
300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2024\/08\/image-12-768x525.png 768w\" sizes=\"(max-width: 874px) 100vw, 874px\" \/><figcaption> \u65b0\u7248\u672ctorch <\/figcaption><\/figure>\n\n\n\n<figure class=\"wp-block-image size-full\"><img loading=\"lazy\" width=\"847\" height=\"478\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2024\/08\/image-13.png\" alt=\"\" class=\"wp-image-16903\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2024\/08\/image-13.png 847w, http:\/\/139.9.1.231\/wp-content\/uploads\/2024\/08\/image-13-300x169.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2024\/08\/image-13-768x433.png 768w\" sizes=\"(max-width: 847px) 100vw, 847px\" \/><figcaption>\u65e7\u7248\u672ctorch<\/figcaption><\/figure>\n\n\n\n<p><strong>\u8d85\u65f6\u8bbe\u7f6e<\/strong>:<\/p>\n\n\n\n<ul><li><code>timeout<\/code> \u53c2\u6570\u7528\u4e8e\u8bbe\u7f6e\u5206\u5e03\u5f0f\u901a\u4fe1\u64cd\u4f5c\u7684\u8d85\u65f6\u65f6\u95f4\u3002\u8d85\u65f6\u65f6\u95f4\u662f <code>timedelta<\/code> \u5bf9\u8c61\uff0c\u8868\u793a\u5728\u7b49\u5f85\u5176\u4ed6\u8fdb\u7a0b\u54cd\u5e94\u65f6\u7684\u6700\u957f\u65f6\u95f4\u3002<\/li><li>\u5728\u4f60\u63d0\u4f9b\u7684\u793a\u4f8b\u4e2d\uff0c<code>timeout<\/code> \u88ab\u8bbe\u7f6e\u4e3a <code>timedelta(seconds=108000)<\/code>\uff0c\u5373 30 \u5c0f\u65f6\u3002\u8fd9\u610f\u5473\u7740\u5206\u5e03\u5f0f\u901a\u4fe1\u64cd\u4f5c\u5c06\u5728 30 \u5c0f\u65f6\u5185\u7b49\u5f85\u5176\u4ed6\u8fdb\u7a0b\u54cd\u5e94\u3002<\/li><\/ul>\n\n\n\n<p><strong>\u7528\u9014<\/strong>:<\/p>\n\n\n\n<ul><li><strong>\u5bb9\u9519\u6027<\/strong>: \u63d0\u9ad8\u5bb9\u9519\u6027\uff0c\u786e\u4fdd\u5728\u957f\u65f6\u95f4\u7b49\u5f85\u671f\u95f4\u4e0d\u4f1a\u56e0\u4e3a\u7f51\u7edc\u5ef6\u8fdf\u6216\u901a\u4fe1\u95ee\u9898\u5bfc\u81f4\u8fdb\u7a0b\u5931\u8d25\u3002<\/li><li><strong>\u8c03\u8bd5<\/strong>: 
\u5728\u8c03\u8bd5\u548c\u6d4b\u8bd5\u4e2d\uff0c\u8bbe\u7f6e\u8f83\u957f\u7684\u8d85\u65f6\u65f6\u95f4\u53ef\u4ee5\u5e2e\u52a9\u8bc6\u522b\u662f\u5426\u56e0\u4e3a\u8d85\u65f6\u8bbe\u7f6e\u8fc7\u77ed\u800c\u5bfc\u81f4\u7684\u901a\u4fe1\u95ee\u9898\u3002<\/li><li><strong>\u9632\u6b62\u6b7b\u9501<\/strong>: \u5728\u590d\u6742\u7684\u5206\u5e03\u5f0f\u8bad\u7ec3\u4efb\u52a1\u4e2d\uff0c\u957f\u65f6\u95f4\u7684\u8d85\u65f6\u65f6\u95f4\u6709\u52a9\u4e8e\u9632\u6b62\u56e0\u901a\u4fe1\u6b7b\u9501\u800c\u5bfc\u81f4\u7684\u8fdb\u7a0b\u5931\u8d25\u3002<\/li><\/ul>\n\n\n\n<p><strong>\u8d85\u65f6\u5904\u7406<\/strong>:<\/p>\n\n\n\n<ul><li>\u5982\u679c\u5728\u6307\u5b9a\u7684\u8d85\u65f6\u65f6\u95f4\u5185\u6ca1\u6709\u6536\u5230\u9884\u671f\u7684\u54cd\u5e94\uff0c<code>init_process_group<\/code> \u5c06\u4f1a\u5f15\u53d1\u8d85\u65f6\u9519\u8bef\u3002\u8fd9\u901a\u5e38\u8868\u793a\u8fdb\u7a0b\u4e4b\u95f4\u7684\u901a\u4fe1\u51fa\u73b0\u4e86\u95ee\u9898\uff0c\u53ef\u80fd\u9700\u8981\u68c0\u67e5\u7f51\u7edc\u8fde\u63a5\u3001\u8fdb\u7a0b\u914d\u7f6e\u6216\u5176\u4ed6\u6f5c\u5728\u95ee\u9898\u3002<\/li><\/ul>\n\n\n\n<p><code>TORCH_NCCL_BLOCKING_WAIT<\/code> \u662f\u4e00\u4e2a\u73af\u5883\u53d8\u91cf\uff0c\u7528\u4e8e\u63a7\u5236 PyTorch \u5728\u4f7f\u7528 NCCL \u540e\u7aef\u65f6\u7684\u901a\u4fe1\u7b49\u5f85\u7b56\u7565\u3002\u5177\u4f53\u6765\u8bf4\uff0c\u5b83\u51b3\u5b9a\u4e86 NCCL \u64cd\u4f5c\u662f\u5426\u4f7f\u7528\u963b\u585e\u7b49\u5f85\u65b9\u5f0f\u6765\u5904\u7406\u901a\u4fe1\u64cd\u4f5c\u3002<\/p>\n\n\n\n<h4><code>TORCH_NCCL_BLOCKING_WAIT<\/code> \u7684\u4f5c\u7528<\/h4>\n\n\n\n<ul><li><strong><code>TORCH_NCCL_BLOCKING_WAIT=1<\/code><\/strong>:<ul><li><strong>\u542f\u7528\u963b\u585e\u7b49\u5f85<\/strong>: \u5f53\u8bbe\u7f6e\u4e3a 1 \u65f6\uff0cPyTorch \u5728\u6267\u884c NCCL \u64cd\u4f5c\uff08\u5982 <code>all-reduce<\/code> \u6216 <code>broadcast<\/code>\uff09\u65f6\uff0c\u4f1a\u4f7f\u7528\u963b\u585e\u7b49\u5f85\u7684\u65b9\u5f0f\u3002\u8fd9\u610f\u5473\u7740 PyTorch 
\u4f1a\u7b49\u5f85\u64cd\u4f5c\u5b8c\u5168\u5b8c\u6210\u6216\u8d85\u65f6\u4e4b\u540e\u624d\u7ee7\u7eed\u6267\u884c\u3002\u8fd9\u79cd\u8bbe\u7f6e\u53ef\u4ee5\u5e2e\u52a9\u786e\u4fdd\u6240\u6709\u8fdb\u7a0b\u5728\u7ee7\u7eed\u4e4b\u524d\u90fd\u5b8c\u6210\u4e86\u901a\u4fe1\uff0c\u6709\u52a9\u4e8e\u89e3\u51b3\u56e0\u5f02\u6b65\u64cd\u4f5c\u5f15\u8d77\u7684\u6570\u636e\u540c\u6b65\u95ee\u9898\u6216\u9519\u8bef\u3002<\/li><\/ul><\/li><li><strong><code>TORCH_NCCL_BLOCKING_WAIT=0<\/code><\/strong>:<ul><li><strong>\u7981\u7528\u963b\u585e\u7b49\u5f85<\/strong>: \u9ed8\u8ba4\u60c5\u51b5\u4e0b\uff08\u5373\u8bbe\u7f6e\u4e3a 0\uff09\uff0cPyTorch \u4f7f\u7528\u975e\u963b\u585e\u7b49\u5f85\u65b9\u5f0f\u3002NCCL \u64cd\u4f5c\u5728\u540e\u53f0\u5f02\u6b65\u8fdb\u884c\uff0c\u53ef\u80fd\u4f1a\u5bfc\u81f4\u5728\u64cd\u4f5c\u5b8c\u6210\u4e4b\u524d\u7a0b\u5e8f\u7ee7\u7eed\u6267\u884c\u3002\u8fd9\u79cd\u65b9\u5f0f\u53ef\u80fd\u4f1a\u5728\u7f51\u7edc\u5ef6\u8fdf\u6216\u7cfb\u7edf\u8d1f\u8f7d\u8f83\u9ad8\u65f6\u5f15\u53d1\u901a\u4fe1\u8d85\u65f6\u6216\u6570\u636e\u4e0d\u4e00\u81f4\u7684\u95ee\u9898\u3002<\/li><\/ul><\/li><\/ul>\n\n\n\n<h4>\u5982\u4f55\u8bbe\u7f6e <code>TORCH_NCCL_BLOCKING_WAIT<\/code><\/h4>\n\n\n\n<p>\u4f60\u53ef\u4ee5\u901a\u8fc7\u4ee5\u4e0b\u65b9\u5f0f\u8bbe\u7f6e <code>TORCH_NCCL_BLOCKING_WAIT<\/code> \u73af\u5883\u53d8\u91cf\uff1a<\/p>\n\n\n\n<ol><li><strong>\u4e34\u65f6\u8bbe\u7f6e<\/strong>: \u5728\u8fd0\u884c\u7a0b\u5e8f\u65f6\uff0c\u53ef\u4ee5\u5728\u547d\u4ee4\u884c\u4e2d\u4e34\u65f6\u8bbe\u7f6e\u73af\u5883\u53d8\u91cf\uff1a<code>TORCH_NCCL_BLOCKING_WAIT=1 python your_training_script.py<\/code><\/li><li><strong>\u4f1a\u8bdd\u7ea7\u8bbe\u7f6e<\/strong>: \u5728\u7ec8\u7aef\u4f1a\u8bdd\u4e2d\uff0c\u53ef\u4ee5\u901a\u8fc7 export \u547d\u4ee4\u4e3a\u5f53\u524d\u4f1a\u8bdd\u8bbe\u7f6e\uff1a<code>export TORCH_NCCL_BLOCKING_WAIT=1 
<\/code>\u8fd9\u4e2a\u8bbe\u7f6e\u4f1a\u5728\u5f53\u524d\u7ec8\u7aef\u4f1a\u8bdd\u4e2d\u751f\u6548\uff0c\u76f4\u5230\u4f1a\u8bdd\u7ed3\u675f\u6216\u91cd\u65b0\u542f\u52a8\u3002<\/li><li><strong>\u5728\u811a\u672c\u4e2d\u8bbe\u7f6e<\/strong>: \u5982\u679c\u4f60\u5e0c\u671b\u5728 Python \u811a\u672c\u5185\u90e8\u8bbe\u7f6e\u8fd9\u4e2a\u53d8\u91cf\uff0c\u53ef\u4ee5\u5728\u811a\u672c\u7684\u5f00\u5934\u6dfb\u52a0\uff1a<code>import os; os.environ['TORCH_NCCL_BLOCKING_WAIT'] = '1'<\/code><\/li><\/ol>\n\n\n\n<h4>\u4f7f\u7528\u573a\u666f<\/h4>\n\n\n\n<ul><li><strong>\u8c03\u8bd5\u548c\u7a33\u5b9a\u6027<\/strong>:<ul><li>\u542f\u7528\u963b\u585e\u7b49\u5f85\u6709\u52a9\u4e8e\u8c03\u8bd5\u548c\u89e3\u51b3 NCCL \u64cd\u4f5c\u4e2d\u7684\u540c\u6b65\u95ee\u9898\u3002\u5b83\u786e\u4fdd\u6240\u6709\u901a\u4fe1\u64cd\u4f5c\u5b8c\u6210\u540e\u624d\u7ee7\u7eed\u6267\u884c\uff0c\u6709\u52a9\u4e8e\u63d0\u9ad8\u7cfb\u7edf\u7684\u7a33\u5b9a\u6027\u3002<\/li><\/ul><\/li><li><strong>\u7f51\u7edc\u4e0d\u7a33\u5b9a\u548c\u8d1f\u8f7d\u9ad8<\/strong>:<ul><li>\u5728\u7f51\u7edc\u5ef6\u8fdf\u8f83\u9ad8\u6216\u7cfb\u7edf\u8d1f\u8f7d\u8f83\u5927\u7684\u73af\u5883\u4e2d\uff0c\u542f\u7528\u963b\u585e\u7b49\u5f85\u53ef\u4ee5\u51cf\u5c11\u7531\u4e8e\u5f02\u6b65\u64cd\u4f5c\u5bfc\u81f4\u7684\u8d85\u65f6\u548c\u9519\u8bef\u3002<\/li><\/ul><\/li><\/ul>\n\n\n\n<h4>\u6ce8\u610f\u4e8b\u9879<\/h4>\n\n\n\n<ul><li><strong>\u6027\u80fd\u5f71\u54cd<\/strong>:<ul><li>\u963b\u585e\u7b49\u5f85\u53ef\u80fd\u4f1a\u589e\u52a0\u901a\u4fe1\u64cd\u4f5c\u7684\u7b49\u5f85\u65f6\u95f4\uff0c\u5f71\u54cd\u6574\u4f53\u8bad\u7ec3\u6027\u80fd\uff0c\u7279\u522b\u662f\u5728\u5927\u89c4\u6a21\u5206\u5e03\u5f0f\u8bad\u7ec3\u4efb\u52a1\u4e2d\u3002<\/li><\/ul><\/li><li><strong>\u8d85\u65f6\u95ee\u9898<\/strong>:<ul><li>\u5982\u679c\u8d85\u65f6\u65f6\u95f4\u8bbe\u7f6e\u8fc7\u77ed\u6216\u7f51\u7edc\u72b6\u51b5\u8f83\u5dee\uff0c\u542f\u7528\u963b\u585e\u7b49\u5f85\u53ef\u80fd\u5bfc\u81f4\u66f4\u591a\u7684\u8d
85\u65f6\u9519\u8bef\u3002\u56e0\u6b64\uff0c\u9700\u8981\u5e73\u8861\u7a33\u5b9a\u6027\u548c\u6027\u80fd\u3002<\/li><\/ul><\/li><\/ul>\n\n\n\n<h4>\u603b\u7ed3<\/h4>\n\n\n\n<p><code>TORCH_NCCL_BLOCKING_WAIT<\/code> \u73af\u5883\u53d8\u91cf\u63a7\u5236 PyTorch \u4f7f\u7528 NCCL \u540e\u7aef\u65f6\u7684\u901a\u4fe1\u7b49\u5f85\u7b56\u7565\u3002\u8bbe\u7f6e\u4e3a 1 \u53ef\u4ee5\u542f\u7528\u963b\u585e\u7b49\u5f85\uff0c\u6709\u52a9\u4e8e\u63d0\u9ad8\u7cfb\u7edf\u7a33\u5b9a\u6027\u548c\u8c03\u8bd5\u80fd\u529b\uff0c\u4f46\u53ef\u80fd\u4f1a\u5f71\u54cd\u6027\u80fd\u3002\u6839\u636e\u5177\u4f53\u7684\u8bad\u7ec3\u4efb\u52a1\u548c\u73af\u5883\uff0c\u53ef\u4ee5\u9009\u62e9\u5408\u9002\u7684\u8bbe\u7f6e\u6765\u4f18\u5316\u8bad\u7ec3\u8fc7\u7a0b\u3002<\/p>\n\n\n\n<p class=\"has-light-gray-background-color has-background\"><strong>\u76f8\u5173\u73af\u5883\u53d8\u91cf\u89e3\u91ca:<\/strong><\/p>\n\n\n\n<p class=\"has-light-pink-background-color has-background\"><a href=\"https:\/\/pytorch.org\/docs\/stable\/torch_nccl_environment_variables.html\">https:\/\/pytorch.org\/docs\/stable\/torch_nccl_environment_variables.html<\/a><\/p>\n\n\n\n<figure class=\"wp-block-image size-large\"><img loading=\"lazy\" width=\"1024\" height=\"695\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2024\/08\/image-41-1024x695.png\" alt=\"\" class=\"wp-image-17209\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2024\/08\/image-41-1024x695.png 1024w, http:\/\/139.9.1.231\/wp-content\/uploads\/2024\/08\/image-41-300x204.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2024\/08\/image-41-768x521.png 768w, http:\/\/139.9.1.231\/wp-content\/uploads\/2024\/08\/image-41.png 1036w\" sizes=\"(max-width: 1024px) 100vw, 1024px\" \/><\/figure>\n\n\n\n<figure class=\"wp-block-image size-full\"><img loading=\"lazy\" width=\"843\" height=\"858\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2024\/08\/image-42.png\" alt=\"\" class=\"wp-image-17210\" 
srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2024\/08\/image-42.png 843w, http:\/\/139.9.1.231\/wp-content\/uploads\/2024\/08\/image-42-295x300.png 295w, http:\/\/139.9.1.231\/wp-content\/uploads\/2024\/08\/image-42-768x782.png 768w\" sizes=\"(max-width: 843px) 100vw, 843px\" \/><\/figure>\n\n\n\n<h3>3\u3001\u589e\u52a0 num_workers \u6765\u52a0\u5feb\u5904\u7406\u6570\u636e\u3010<code>DataLoader<\/code>\u9636\u6bb5\u5bfc\u81f4 NCCL\u8d85\u65f6\u3011<\/h3>\n\n\n\n<p>\u5982\u679c\u662f\u6570\u636e\u52a0\u8f7d\u7684\u65f6\u95f4\u8fc7\u957f\uff0c\u5bfc\u81f4NCCL\u901a\u4fe1\u8d85\u65f6\uff0c\u8003\u8651\u589e\u52a0num_workers\u6765\u63d0\u9ad8\u6570\u636e\u52a0\u8f7d\u901f\u5ea6\u3002<\/p>\n\n\n\n<p><strong>\u51cf\u5c11\u6570\u636e\u52a0\u8f7d\u74f6\u9888<\/strong>:<\/p>\n\n\n\n<ul><li>\u589e\u52a0 <code>num_workers<\/code> \u53ef\u4ee5\u63d0\u9ad8\u6570\u636e\u52a0\u8f7d\u901f\u5ea6\uff0c\u51cf\u5c11\u8bad\u7ec3\u8fc7\u7a0b\u4e2d\u56e0\u6570\u636e\u52a0\u8f7d\u800c\u5bfc\u81f4\u7684\u7b49\u5f85\u65f6\u95f4\u3002\u8fd9\u53ef\u4ee5\u95f4\u63a5\u51cf\u5c11\u7531\u4e8e\u6570\u636e\u5904\u7406\u7f13\u6162\u800c\u53ef\u80fd\u5f15\u53d1\u7684 NCCL \u8d85\u65f6\u95ee\u9898\u3002<\/li><\/ul>\n\n\n\n<p><strong>\u63d0\u9ad8\u8bad\u7ec3\u6548\u7387<\/strong>:<\/p>\n\n\n\n<ul><li>\u66f4\u9ad8\u6548\u7684\u6570\u636e\u52a0\u8f7d\u53ef\u4ee5\u63d0\u9ad8\u6574\u4f53\u8bad\u7ec3\u6548\u7387\uff0c\u4f7f\u8bad\u7ec3\u8fc7\u7a0b\u66f4\u52a0\u987a\u7545\uff0c\u4ece\u800c\u53ef\u80fd\u51cf\u5c11\u7531\u4e8e\u7cfb\u7edf\u8d1f\u8f7d\u4e0d\u5747\u5bfc\u81f4\u7684\u901a\u4fe1\u8d85\u65f6\u95ee\u9898\u3002<\/li><\/ul>\n\n\n\n<h3>4\u3001  <code>DistributedSampler<\/code>  \u91c7\u6837\u9636\u6bb5\u5bfc\u81f4 NCCL\u8d85\u65f6\uff1a<\/h3>\n\n\n\n<p>\u5982\u679c\u5206\u5e03\u5f0f\u8bad\u7ec3\u4e2d NCCL \u8d85\u65f6\u95ee\u9898\u53d1\u751f\u5728\u91c7\u6837\u9636\u6bb5\uff08\u7279\u522b\u662f\u5728\u4f7f\u7528 <code>DistributedSampler<\/code> 
\u6216\u81ea\u5b9a\u4e49\u7684\u91c7\u6837\u5668\u65f6\uff09\uff0c\u53ef\u80fd\u8868\u660e\u5b58\u5728\u67d0\u4e9b\u6f5c\u5728\u7684\u95ee\u9898\uff0c\u8fd9\u4e9b\u95ee\u9898\u53ef\u80fd\u5bfc\u81f4\u8bad\u7ec3\u8fdb\u7a0b\u4e4b\u95f4\u7684\u540c\u6b65\u6216\u6570\u636e\u4f20\u8f93\u6548\u7387\u4f4e\u4e0b\u3002\u4ee5\u4e0b\u662f\u4e00\u4e9b\u53ef\u80fd\u7684\u539f\u56e0\u548c\u89e3\u51b3\u65b9\u6cd5\uff1a<\/p>\n\n\n\n<p><strong>\u53ef\u80fd\u7684\u539f\u56e0<\/strong>\uff1a<\/p>\n\n\n\n<ol><li><strong>\u6570\u636e\u52a0\u8f7d\u548c\u91c7\u6837\u901f\u5ea6\u95ee\u9898<\/strong>:<ul><li>\u5982\u679c\u91c7\u6837\u5668\u7684\u6027\u80fd\u4e0d\u4f73\uff0c\u53ef\u80fd\u4f1a\u5bfc\u81f4\u6570\u636e\u52a0\u8f7d\u901f\u5ea6\u53d8\u6162\uff0c\u4ece\u800c\u5f71\u54cd\u8bad\u7ec3\u8fc7\u7a0b\u3002\u867d\u7136\u8fd9\u4e0d\u4f1a\u76f4\u63a5\u5bfc\u81f4 NCCL \u8d85\u65f6\uff0c\u4f46\u5b83\u4f1a\u95f4\u63a5\u5f71\u54cd\u6574\u4f53\u8bad\u7ec3\u6027\u80fd\u3002<\/li><\/ul><\/li><li><strong>\u8fdb\u7a0b\u540c\u6b65\u95ee\u9898<\/strong>:<ul><li>\u5728\u4f7f\u7528 <code>DistributedSampler<\/code> 
\u65f6\uff0c\u6240\u6709\u8fdb\u7a0b\u9700\u8981\u540c\u6b65\u4ee5\u786e\u4fdd\u6570\u636e\u7684\u4e00\u81f4\u6027\u3002\u5982\u679c\u91c7\u6837\u5668\u5728\u67d0\u4e9b\u8fdb\u7a0b\u4e2d\u51fa\u73b0\u5ef6\u8fdf\u6216\u963b\u585e\uff0c\u53ef\u80fd\u4f1a\u5bfc\u81f4\u901a\u4fe1\u8d85\u65f6\u3002<\/li><\/ul><\/li><li><strong>\u6570\u636e\u5206\u5e03\u4e0d\u5747<\/strong>:<ul><li>\u5982\u679c\u6570\u636e\u5206\u5e03\u4e0d\u5747\uff0c\u67d0\u4e9b\u8fdb\u7a0b\u53ef\u80fd\u4f1a\u6bd4\u5176\u4ed6\u8fdb\u7a0b\u5904\u7406\u66f4\u591a\u7684\u6570\u636e\uff0c\u4ece\u800c\u5bfc\u81f4\u901a\u4fe1\u5ef6\u8fdf\u548c\u8d85\u65f6\u95ee\u9898\u3002<\/li><\/ul><\/li><li>\u6570\u636e\u9884\u5904\u7406\u590d\u6742\uff1a\u6570\u636e\u9884\u5904\u7406\u592a\u590d\u6742\uff0c\u4f1a\u5bfc\u81f4\u6570\u636e\u52a0\u8f7d\u8fc7\u6162\uff0c\u4e5f\u6709\u53ef\u80fd\u5bfc\u81f4\u8d85\u65f6<\/li><\/ol>\n\n\n\n<h4>\u89e3\u51b3\u65b9\u6cd5\uff1a<\/h4>\n\n\n\n<ol><li><strong>\u4f18\u5316\u91c7\u6837\u5668\u548c\u6570\u636e\u52a0\u8f7d<\/strong>:<ul><li>\u786e\u4fdd\u81ea\u5b9a\u4e49\u91c7\u6837\u5668\u6216 <code>DistributedSampler<\/code> \u4ee5\u9ad8\u6548\u7684\u65b9\u5f0f\u8fdb\u884c\u6570\u636e\u91c7\u6837\u548c\u5206\u914d\u3002\u4f18\u5316\u6570\u636e\u52a0\u8f7d\u901f\u5ea6\uff0c\u786e\u4fdd\u6bcf\u4e2a\u8fdb\u7a0b\u5728\u91c7\u6837\u65f6\u4e0d\u4f1a\u957f\u65f6\u95f4\u7b49\u5f85\u3002<\/li><li>\u4f7f\u7528 <code>num_workers<\/code> \u8bbe\u7f6e\u5408\u7406\u7684\u6570\u91cf\uff0c\u4ee5\u52a0\u5feb\u6570\u636e\u52a0\u8f7d\u901f\u5ea6\uff0c\u4f46\u8981\u6ce8\u610f CPU \u5185\u5b58\u548c\u7cfb\u7edf\u8d1f\u8f7d\u3002<\/li><\/ul><\/li><li><strong>\u8c03\u6574\u8d85\u65f6\u65f6\u95f4<\/strong>:<ul><li>\u589e\u52a0 <code>NCCL_TIMEOUT<\/code> \u73af\u5883\u53d8\u91cf\u503c\u6216 <code>dist.init_process_group<\/code> \u4e2d\u7684 <code>timeout<\/code> 
\u53c2\u6570\uff0c\u4ee5\u5141\u8bb8\u66f4\u957f\u7684\u7b49\u5f85\u65f6\u95f4\u3002<\/li><\/ul><\/li><\/ol>\n\n\n\n<h3>5\u3001\u57fa\u4e8eHuggingFace\u7684Trainer\u591a\u673a\u591a\u5361\u8bad\u7ec3LLM\u5bfc\u81f4NCCL\u8d85\u65f6<\/h3>\n\n\n\n<ol type=\"1\"><li><strong>\u542f\u52a8\u547d\u4ee4\u524d\u589e\u52a0\u4e86OMP_NUM_THREADS=1 MKL_NUM_THREADS=1\uff0c\u907f\u514d\u591a\u7ebf\u7a0b\u5bfc\u81f4\u6b7b\u9501\uff1b<\/strong><\/li><li><strong>\u53bb\u6389\u4e86\u52a0\u8f7d\u6570\u636e\u65f6\u7684tqdm\uff1b<\/strong><\/li><li><strong>\u5c06\u6570\u636e\u7684DataLoader\u7684drop_last\u8bbe\u7f6e\u4e3aTrue\uff0cpin_memory\u8bbe\u7f6e\u4e3aTrue\uff0cnum_workers\u8bbe\u7f6e\u4e3a0\uff1b<\/strong><\/li><li><strong>\u8bbe\u7f6e\u8bad\u7ec3\u6279\u5927\u5c0f\u4e3aauto\/\u8bbe\u7f6e\u5c0f\u4e00\u70b9<\/strong><\/li><\/ol>\n\n\n\n<h3><strong>\u67e5\u9605\u4e86\u4e00\u4e9b\u8d44\u6599<\/strong><\/h3>\n\n\n\n<ol><li><a href=\"https:\/\/blog.csdn.net\/weixin_42001089\/article\/details\/122733667\">pytorch \u591a\u673a\u591a\u5361\u5361\u4f4f\u95ee\u9898\u6c47\u603b<\/a><\/li><li><a href=\"https:\/\/github.com\/pytorch\/pytorch\/issues\/22834\">Script freezes with no output when using DistributedDataParallel<\/a><\/li><li><a href=\"https:\/\/blog.csdn.net\/yyywxk\/article\/details\/106323049\">PyTorch \u8bad\u7ec3\u65f6\u4e2d\u9047\u5230\u7684\u5361\u4f4f\u505c\u4f4f\u7b49\u95ee\u9898<\/a><\/li><li><a href=\"http:\/\/blog.ziyouman.cn\/?id=75\">PyTorch\u8bad\u7ec3\u65f6\uff0cDataloader\u5361\u6b7b\u3001\u6302\u8d77\uff0c\u8dd1\u4e00\u4e2aepoch\u505c\u4e86\uff0c\u95ee\u9898\u89e3\u51b3\u65b9\u6848<\/a><\/li><li><a href=\"https:\/\/github.com\/bubbliiiing\/faster-rcnn-pytorch\/issues\/9\">\u8fd0\u884c\u5f00\u59cb\u8bad\u7ec3\uff0c\u5361\u4f4f\u534a\u5c0f\u65f6\uff0c\u4e00\u76f4\u4e0d\u52a8<\/a><\/li><li><a href=\"https:\/\/www.cvmart.net\/community\/detail\/5973\">\u5173\u4e8e\u70bc\u4e39\uff0c\u4f60\u662f\u5426\u77e5\u9053\u8fd9\u4e9b\u7ec6\u8282\uff1f<\/a><\/li><li><a 
href=\"https:\/\/github.com\/ultralytics\/yolov5\/issues\/7481\">ultralytics\/yolov5#7481<\/a><\/li><li><a href=\"https:\/\/www.zhihu.com\/question\/512132168\">https:\/\/www.zhihu.com\/question\/512132168<\/a><\/li><li><a href=\"https:\/\/discuss.pytorch.org\/t\/nccl-timed-out-when-using-the-torch-distributed-run\/153276\">https:\/\/discuss.pytorch.org\/t\/nccl-timed-out-when-using-the-torch-distributed-run\/153276<\/a><\/li><li><a href=\"https:\/\/stackoverflow.com\/questions\/69693950\/error-some-nccl-operations-have-failed-or-timed-out\">https:\/\/stackoverflow.com\/questions\/69693950\/error-some-nccl-operations-have-failed-or-timed-out<\/a><\/li><\/ol>\n","protected":false},"excerpt":{"rendered":"<p>\u6df1\u5ea6\u5b66\u4e60\u7684\u53d1\u5c55\u8bc1\u660e\u4e86\u5927\u6570\u636e\u548c\u5927\u6a21\u578b\u7684\u4ef7\u503c\u3002\u65e0\u8bba\u662f\u5728CV\u8fd8\u662fNLP\u9886\u57df\uff0c\u5728\u5927\u89c4\u6a21\u7684\u8ba1\u7b97\u8d44\u6e90\u4e0a\u8bad\u7ec3\u6a21\u578b\u7684\u80fd\u529b\u53d8\u5f97\u65e5 &hellip; <a href=\"http:\/\/139.9.1.231\/index.php\/2024\/08\/06\/ddp-dataloader\/\" class=\"more-link\">\u7ee7\u7eed\u9605\u8bfb<span 
class=\"screen-reader-text\">DDP\u5206\u5e03\u5f0f\u8bad\u7ec3&#8211;\u6570\u636e\u52a0\u8f7d\u548c\u8bad\u7ec3NCCL<\/span><\/a><\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":[],"categories":[11,39],"tags":[],"_links":{"self":[{"href":"http:\/\/139.9.1.231\/index.php\/wp-json\/wp\/v2\/posts\/16556"}],"collection":[{"href":"http:\/\/139.9.1.231\/index.php\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"http:\/\/139.9.1.231\/index.php\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"http:\/\/139.9.1.231\/index.php\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"http:\/\/139.9.1.231\/index.php\/wp-json\/wp\/v2\/comments?post=16556"}],"version-history":[{"count":303,"href":"http:\/\/139.9.1.231\/index.php\/wp-json\/wp\/v2\/posts\/16556\/revisions"}],"predecessor-version":[{"id":23439,"href":"http:\/\/139.9.1.231\/index.php\/wp-json\/wp\/v2\/posts\/16556\/revisions\/23439"}],"wp:attachment":[{"href":"http:\/\/139.9.1.231\/index.php\/wp-json\/wp\/v2\/media?parent=16556"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"http:\/\/139.9.1.231\/index.php\/wp-json\/wp\/v2\/categories?post=16556"},{"taxonomy":"post_tag","embeddable":true,"href":"http:\/\/139.9.1.231\/index.php\/wp-json\/wp\/v2\/tags?post=16556"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}