{"id":24325,"date":"2025-02-05T09:32:00","date_gmt":"2025-02-05T01:32:00","guid":{"rendered":"http:\/\/139.9.1.231\/?p=24325"},"modified":"2025-07-20T09:28:38","modified_gmt":"2025-07-20T01:28:38","slug":"llm-alignment-techniques-rlhf-rlaif-ppo-dpo-and-more","status":"publish","type":"post","link":"http:\/\/139.9.1.231\/index.php\/2025\/02\/05\/llm-alignment-techniques-rlhf-rlaif-ppo-dpo-and-more\/","title":{"rendered":"LLM\u8bad\u7ec3-\u4eba\u5de5\u5f3a\u5316\u53cd\u9988\u5bf9\u9f50\u7b97\u6cd5\uff1aRLHF, RLAIF, PPO, DPO and More"},"content":{"rendered":"\n<p><strong>\u53c2\u8003\u8bba\u6587\uff1a<a rel=\"noreferrer noopener\" href=\"https:\/\/arxiv.org\/pdf\/2407.16216\" target=\"_blank\">A Comprehensive Survey of LLM Alignment Techniques: RLHF, RLAIF, PPO, DPO and More  <\/a><\/strong><\/p>\n\n\n\n<ul><li><strong><em>\u76f8\u5173\u535a\u5ba2\uff1a<a rel=\"noreferrer noopener\" href=\"https:\/\/wqw547243068.github.io\/rlhf\" target=\"_blank\">https:\/\/wqw547243068.github.io\/rlhf<\/a><\/em><\/strong><\/li><li><strong><em><a href=\"https:\/\/wqw547243068.github.io\/rlhf#%E6%80%9D%E8%80%83-1\">\u91cd\u8981\uff1ahttps:\/\/wqw547243068.github.io\/rlhf<\/a><\/em><\/strong><\/li><\/ul>\n\n\n\n<p><strong>\u53c2\u8003\u4ee3\u7801\uff1a<\/strong><\/p>\n\n\n\n<ul class=\"has-light-blue-background-color has-background\"><li><strong><a rel=\"noreferrer noopener\" href=\"https:\/\/github.com\/PKU-Alignment\/align-anything\" target=\"_blank\">Align Anything: Training All-modality Model with Feedback<\/a><\/strong><\/li><li><a href=\"https:\/\/github.com\/OpenRLHF\/OpenRLHF\/blob\/main\/README_zh.md\"><strong><em>https:\/\/github.com\/OpenRLHF\/OpenRLHF<\/em><\/strong><\/a><\/li><\/ul>\n\n\n\n\n\n<p><strong>\u7aef\u5230\u7aef\u63a8\u7406\u5b66\u4e60\u589e\u5f3a\u63a8\u7406\u80fd\u529b\u65b9\u6cd5<\/strong> <\/p>\n\n\n\n<p>a.\u63a8\u7406\u65f6\u6269\u5c55\uff08Inference-time scaling\uff09\uff1a 
\u5982\u94fe\u5f0f\u601d\u7ef4\uff08CoT\uff09\u6216\u81ea\u6211\u4e00\u81f4\u6027\uff08Self-Consistency\uff09\uff0c\u4ee5\u589e\u5f3a\u6a21\u578b\u7684\u63a8\u7406\u80fd\u529b\uff1b\u3010cot\uff1a\u6838\u5fc3\u601d\u60f3\u662f\u5c06\u590d\u6742\u95ee\u9898\u5206\u89e3\u4e3a\u4e00\u7cfb\u5217\u53ef\u89e3\u91ca\u7684\u4e2d\u95f4\u6b65\u9aa4\u3002\u901a\u8fc7\u660e\u786e\u7684\u63a8\u7406\u94fe\u6761\uff0c\u6a21\u578b\u80fd\u591f\u9010\u6b65\u89e3\u51b3\u539f\u672c\u53ef\u80fd\u8d85\u51fa\u5176\u76f4\u63a5\u63a8\u7406\u80fd\u529b\u7684\u95ee\u9898\u3002\u601d\u7ef4\u94fe\u65b9\u6cd5\u7279\u522b\u9002\u7528\u4e8e\u6d89\u53ca\u591a\u6b65\u9aa4\u63a8\u7406\u7684\u4efb\u52a1\uff0c\u5982\u6570\u5b66\u9898\u3001\u591a\u91cd\u903b\u8f91\u63a8\u7406\u95ee\u9898\u7b49\u3002 Self-Consistency \u81ea\u6211\u4e00\u81f4\u63d0\u793a\u662f\u5728 CoT \u57fa\u7840\u4e0a\u8fdb\u4e00\u6b65\u4f18\u5316\uff0c\u901a\u8fc7\u91c7\u6837\u591a\u6761\u63a8\u7406\u8def\u5f84\uff0c\u627e\u51fa\u6700\u4e00\u81f4\u7684\u7b54\u6848\u3002\u5b83\u9002\u7528\u4e8e\u5bf9\u7ed3\u679c\u51c6\u786e\u6027\u8981\u6c42\u66f4\u9ad8\u7684\u573a\u666f\uff0c\u907f\u514d\u4e00\u6b21\u6027\u63a8\u7406\u8def\u5f84\u7684\u5076\u7136\u6027\u5bfc\u81f4\u9519\u8bef\u3002\u3011<\/p>\n\n\n\n<p> b.\u7eaf\u5f3a\u5316\u5b66\u4e60\uff08Pure Reinforcement Learning, RL\uff09\uff1a \u901a\u8fc7\u5f3a\u5316\u5b66\u4e60\u8bad\u7ec3\u6a21\u578b\uff0c\u4f7f\u5176\u5728\u6ca1\u6709\u76d1\u7763\u6570\u636e\u7684\u60c5\u51b5\u4e0b\uff0c\u901a\u8fc7\u8bd5\u9519\u5b66\u4e60\u590d\u6742\u4efb\u52a1; \u3010deepseek-R1-zero\u3011<\/p>\n\n\n\n<p> c.\u76d1\u7763\u5fae\u8c03\u7ed3\u5408\u5f3a\u5316\u5b66\u4e60\uff08SFT + RL\uff09\uff1a \u9996\u5148\u5bf9\u6a21\u578b\u8fdb\u884c\u76d1\u7763\u5fae\u8c03\uff0c\u7136\u540e\u4f7f\u7528\u5f3a\u5316\u5b66\u4e60\u8fdb\u884c\u8fdb\u4e00\u6b65\u4f18\u5316\uff0c\u4ee5\u63d0\u9ad8\u6a21\u578b\u7684\u63a8\u7406\u80fd\u529b\u3002\u3010deepseek-R1\u3011<\/p>\n\n\n\n<p> 
d.\u7eaf\u76d1\u7763\u5fae\u8c03\u548c\u84b8\u998f\uff08Pure Supervised Fine-Tuning and Distillation\uff09\u4ec5\u4f7f\u7528\u76d1\u7763\u5b66\u4e60\u548c\u6a21\u578b\u84b8\u998f\u6280\u672f\u6765\u589e\u5f3a\u6a21\u578b\u7684\u63a8\u7406\u80fd\u529b\u3002\u3010deepseek-R1-distill<strong>\u84b8\u998f\u6a21\u578b<\/strong>\u3011<\/p>\n\n\n\n<figure class=\"wp-block-image size-full\"><img loading=\"lazy\" width=\"810\" height=\"844\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-63.png\" alt=\"\" class=\"wp-image-24909\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-63.png 810w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-63-288x300.png 288w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-63-768x800.png 768w\" sizes=\"(max-width: 810px) 100vw, 810px\" \/><\/figure>\n\n\n\n<figure class=\"wp-block-image size-large\"><img loading=\"lazy\" width=\"1024\" height=\"572\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-65-1024x572.png\" alt=\"\" class=\"wp-image-24943\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-65-1024x572.png 1024w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-65-300x168.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-65-768x429.png 768w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-65.png 1279w\" sizes=\"(max-width: 1024px) 100vw, 1024px\" \/><\/figure>\n\n\n\n<figure class=\"wp-block-image size-large\"><img loading=\"lazy\" width=\"1024\" height=\"497\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-1024x497.png\" alt=\"\" class=\"wp-image-24326\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-1024x497.png 1024w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-300x146.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-768x373.png 768w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image.png 1114w\" sizes=\"(max-width: 
1024px) 100vw, 1024px\" \/><\/figure>\n\n\n\n<p><strong>\u4e00\u4e2a\u5b8c\u6574\u7684LLM\u8bad\u7ec3\u8fc7\u7a0b\u5305\u542b\u4ee5\u4e0b\u51e0\u6b65\uff1a<\/strong><\/p>\n\n\n\n<ul><li><strong>Model Initialization<\/strong>\uff1a\u52a0\u8f7d\u6a21\u578b\u548c\u5904\u7406\u5668<\/li><li><strong>\u6570\u636e\u51c6\u5907<\/strong>\uff1a\u89e3\u6790\u6570\u636e\u96c6\u5e76\u8bbe\u7f6e\u5176\u683c\u5f0f<\/li><li><strong>\u6a21\u578b\u63a8\u7406<\/strong>\uff1a\u5c06\u6570\u636e\u8f93\u5165\u5230\u6a21\u578b\u4e2d\u5e76\u83b7\u53d6\u8f93\u51fa<\/li><li><strong>\u68af\u5ea6\u66f4\u65b0<\/strong>\uff1a\u6839\u636e<code>\u635f\u5931\u51fd\u6570<\/code>\u66f4\u65b0\u6a21\u578b\u53c2\u6570<\/li><\/ul>\n\n\n\n<div class=\"wp-block-image\"><figure class=\"aligncenter size-full is-resized\"><img loading=\"lazy\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-1.png\" alt=\"\" class=\"wp-image-24330\" width=\"385\" height=\"388\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-1.png 576w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-1-298x300.png 298w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-1-150x150.png 150w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-1-120x120.png 120w\" sizes=\"(max-width: 385px) 100vw, 385px\" \/><\/figure><\/div>\n\n\n\n<p>\u5bf9\u9f50\uff08alignment\uff09\u5176\u4f5c\u7528\u5c31\u662f\u8ba9 LLM \u4e0e\u4eba\u7c7b\u7684\u4ef7\u503c\u89c2\u4fdd\u6301\u4e00\u81f4\u3002\u5728\u5bf9\u9f50 LLM \u65b9\u9762\uff0c\u57fa\u4e8e<strong>\u4eba\u7c7b\u53cd\u9988\u7684<mark>\u5f3a\u5316\u5b66\u4e60<\/mark>\uff08RLHF\uff09<\/strong>\u662f\u4e00\u79cd\u7a81\u7834\u6027\u7684\u6280\u672f\u3002\u8be5\u65b9\u6cd5\u50ac\u751f\u4e86 GPT-4\u3001Claude \u548c Gemini \u7b49\u5f3a\u5927\u6a21\u578b\u3002RLHF \u4e4b\u540e\uff0c\u4eba\u4eec\u4e5f\u63a2\u7d22\u4e86\u591a\u79cd\u591a\u6837\u7684\u5bf9\u9f50 LLM 
\u7684\u65b9\u6cd5\u3002\u4f46\u662f\uff0c\u6b64\u524d\u8fd8\u6ca1\u6709\u4eba\u5168\u9762\u603b\u7ed3\u5bf9\u9f50 LLM \u4e0e\u4eba\u7c7b\u504f\u597d\u7684\u65b9\u6cd5\u3002<\/p>\n\n\n\n<p>Salesforce \u51b3\u5b9a\u586b\u8865\u8fd9\u4e00\u7a7a\u767d\uff0c\u4e8e\u8fd1\u65e5\u53d1\u5e03\u4e86\u4e00\u4efd 37 \u9875\u7684\u7efc\u8ff0\u62a5\u544a\uff0c\u5176\u4e2d\u6309\u7c7b\u522b\u603b\u7ed3\u4e86\u73b0\u6709\u7684\u7814\u7a76\u6587\u732e\uff0c\u5e76\u8be6\u7ec6\u5206\u6790\u4e86\u5404\u7bc7\u8bba\u6587\u3002<\/p>\n\n\n\n<h2>Introduction<\/h2>\n\n\n\n<p>\u8fd9\u7bc7\u8bba\u6587\u5206\u4e3a\u56db\u5927\u4e3b\u9898\uff1a\u5956\u52b1\u6a21\u578b\u3001\u53cd\u9988\u3001<mark>\u5f3a\u5316\u5b66\u4e60<\/mark>\uff08RL\uff09\u3001\u4f18\u5316\u3002\u6bcf\u4e2a\u4e3b\u9898\u53c8\u5305\u542b\u8fdb\u4e00\u6b65\u7684\u5b50\u4e3b\u9898\uff0c\u5982\u56fe 1 \u6240\u793a\u3002<\/p>\n\n\n\n<figure class=\"wp-block-image size-full is-resized\"><img loading=\"lazy\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-6.png\" alt=\"\" class=\"wp-image-24435\" width=\"531\" height=\"315\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-6.png 951w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-6-300x178.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-6-768x456.png 768w\" sizes=\"(max-width: 531px) 100vw, 531px\" \/><figcaption>xPO LLM \u4e0e\u4eba\u7c7b\u504f\u597d\u4fdd\u6301\u4e00\u81f4\u7684 13 \u4e2a\u5206\u7c7b\u65b9\u5411<\/figcaption><\/figure>\n\n\n\n<p>\u5956\u52b1\u6a21\u578b\u7684\u5b50\u4e3b\u9898\u5305\u62ec\uff1a1. \u663e\u5f0f\u5956\u52b1\u6a21\u578b\u4e0e\u9690\u5f0f\u5956\u52b1\u6a21\u578b\uff1b2. \u9010\u70b9\u5956\u52b1\u6a21\u578b\u4e0e\u504f\u597d\u6a21\u578b\uff1b3. \u54cd\u5e94\u5c42\u9762\u7684\u5956\u52b1\u4e0e token \u5c42\u9762\u7684\u5956\u52b1\uff1b4. 
\u8d1f\u504f\u597d\u4f18\u5316\u3002<\/p>\n\n\n\n<figure class=\"wp-block-image size-full\"><img loading=\"lazy\" width=\"781\" height=\"544\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-7.png\" alt=\"\" class=\"wp-image-24439\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-7.png 781w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-7-300x209.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-7-768x535.png 768w\" sizes=\"(max-width: 781px) 100vw, 781px\" \/><\/figure>\n\n\n\n<p>\u53cd\u9988\u7684\u5b50\u4e3b\u9898\u5305\u62ec\uff1a1. \u504f\u597d\u53cd\u9988\u4e0e\u4e8c\u5143\u53cd\u9988\uff1b2. \u6210\u5bf9\u53cd\u9988\u4e0e\u5217\u8868\u53cd\u9988\uff1b3. \u4eba\u7c7b\u53cd\u9988\u4e0e AI \u53cd\u9988\u3002<\/p>\n\n\n\n<figure class=\"wp-block-image size-full\"><img loading=\"lazy\" width=\"769\" height=\"577\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-8.png\" alt=\"\" class=\"wp-image-24442\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-8.png 769w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-8-300x225.png 300w\" sizes=\"(max-width: 769px) 100vw, 769px\" \/><\/figure>\n\n\n\n<p><mark>\u5f3a\u5316\u5b66\u4e60<\/mark>\u7684\u5b50\u4e3b\u9898\u5305\u62ec\uff1a1. \u57fa\u4e8e\u53c2\u8003\u7684<mark>\u5f3a\u5316\u5b66\u4e60<\/mark>\u4e0e\u65e0\u53c2\u8003\u7684<mark>\u5f3a\u5316\u5b66\u4e60<\/mark>\uff1b2. \u957f\u5ea6\u63a7\u5236\u5f0f<mark>\u5f3a\u5316\u5b66\u4e60<\/mark>\uff1b3.&nbsp;<mark>\u5f3a\u5316\u5b66\u4e60<\/mark>\u4e2d\u7684\u4e0d\u540c\u5206\u652f\uff1b4. \u5728\u7ebf\u7b56\u7565<mark>\u5f3a\u5316\u5b66\u4e60<\/mark>\u4e0e\u79bb\u7ebf\u7b56\u7565<mark>\u5f3a\u5316\u5b66\u4e60<\/mark>\u3002<\/p>\n\n\n\n<p>\u4f18\u5316\u7684\u5b50\u4e3b\u9898\u5305\u62ec\uff1a1. \u5728\u7ebf \/ \u8fed\u4ee3\u5f0f\u504f\u597d\u4f18\u5316\u4e0e\u79bb\u7ebf \/ \u975e\u8fed\u4ee3\u5f0f\u504f\u597d\u4f18\u5316\uff1b2. 
\u5206\u79bb SFT \u548c\u5bf9\u9f50\u4e0e\u5408\u5e76 SFT \u548c\u5bf9\u9f50\u3002<\/p>\n\n\n\n<figure class=\"wp-block-image size-full\"><img loading=\"lazy\" width=\"951\" height=\"352\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-9.png\" alt=\"\" class=\"wp-image-24444\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-9.png 951w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-9-300x111.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-9-768x284.png 768w\" sizes=\"(max-width: 951px) 100vw, 951px\" \/><\/figure>\n\n\n\n<h2>Individual Paper Reviews in Detail<\/h2>\n\n\n\n<h3><strong>1. RLHF\/PPO<\/strong><\/h3>\n\n\n\n<p>LLM \u7684\u9884\u8bad\u7ec3\u8981\u7528\u5230\u5927\u91cf\u6765\u81ea\u4e0d\u540c\u6765\u6e90\u7684<mark>\u8bed\u6599\u5e93<\/mark>\uff0c\u800c\u8fd9\u672c\u8eab\u5c31\u65e0\u6cd5\u786e\u4fdd\u8fd9\u4e9b\u6570\u636e\u96c6\u7684\u8d28\u91cf\u3002\u6b64\u5916\uff0cLLM \u7684\u4e3b\u8981\u76ee\u6807\u662f\u9884\u6d4b\u4e0b\u4e00\u4e2a token\uff0c\u8fd9\u4e2a\u76ee\u6807\u4e0e\u300c\u6709\u7528\u4e14\u5b89\u5168\u5730\u9075\u4ece\u7528\u6237\u6307\u4ee4\u300d\u7684\u76ee\u6807\u5e76\u4e0d\u4e00\u81f4\u3002\u56e0\u6b64\uff0cLLM \u53ef\u80fd\u4f1a\u8f93\u51fa\u4e0d\u771f\u5b9e\u3001\u6709\u5bb3\u6216\u5bf9\u7528\u6237\u65e0\u7528\u7684\u5185\u5bb9\u3002\u672c\u8d28\u4e0a\u8bb2\uff0c\u8fd9\u4e9b\u6a21\u578b\u5e76\u672a\u4e0e\u7528\u6237\u610f\u56fe\u5bf9\u9f50\u3002RLHF\/PPO \u7684\u4e3b\u8981\u76ee\u6807\u662f\u5728\u5404\u79cd\u4efb\u52a1\u4e0a\u5bf9\u9f50<mark>\u8bed\u8a00\u6a21\u578b<\/mark>\u4e0e\u7528\u6237\u610f\u56fe\uff0c\u5176\u505a\u6cd5\u662f\u4f7f\u7528\u4eba\u7c7b\u53cd\u9988\u6765\u5fae\u8c03\u6a21\u578b\u3002\u6709\u5173\u8fd9\u4e2a\u4e3b\u9898\u7684\u7814\u7a76\u6709\u5f88\u591a\u3002<\/p>\n\n\n\n<h3><strong>2. 
RLAIF<\/strong><\/h3>\n\n\n\n<p>\u83b7\u53d6\u4eba\u7c7b\u504f\u597d\u6570\u636e\u96c6\u7684\u6210\u672c\u4e0d\u4f4e\uff0c\u56e0\u6b64\u57fa\u4e8e<mark>\u4eba\u5de5\u667a\u80fd<\/mark>\u53cd\u9988\u7684<mark>\u5f3a\u5316\u5b66\u4e60<\/mark>\uff08RLAIF\uff09\u8bde\u751f\u4e86\u3002\u6b64\u5916\uff0c\u968f\u7740 LLM \u7684\u80fd\u529b\u4e0d\u65ad\u8fdb\u6b65\uff0c\u6240\u80fd\u6536\u96c6\u5230\u7684 AI \u504f\u597d\u6570\u636e\u96c6\u7684\u8d28\u91cf\u4e5f\u4e0d\u65ad\u63d0\u9ad8\uff0c\u7531\u6b64\u53ef\u63d0\u5347 LLM \u7684\u5bf9\u9f50\u6548\u679c\u3002<\/p>\n\n\n\n<h3>3.<strong>\u76f4\u63a5\u4eba\u7c7b\u504f\u597d\u4f18\u5316<\/strong><\/h3>\n\n\n\n<p>\u4f20\u7edf RLHF \u65b9\u6cd5\u901a\u5e38\u6d89\u53ca\u5230\u4f18\u5316\u6e90\u81ea\u4eba\u7c7b\u504f\u597d\u7684\u5956\u52b1\u51fd\u6570\u3002\u8be5\u65b9\u6cd5\u867d\u6709\u6548\uff0c\u4f46\u4e5f\u53ef\u80fd\u5e26\u6765\u4e00\u4e9b\u96be\u9898\uff0c\u6bd4\u5982\u589e\u5927\u8ba1\u7b97\u590d\u6742\u5ea6\u4ee5\u53ca\u5728\u4f30\u8ba1\u548c\u4f18\u5316\u5956\u52b1\u65f6\u9700\u8981\u8003\u8651\u504f\u7f6e &#8211; \u65b9\u5dee\u6743\u8861\u3002\u53c2\u9605\u8bba\u6587\u300aHigh-dimensional continuous control using generalized advantage estimation\u300b\u3002<\/p>\n\n\n\n<p>\u8fd1\u671f\u6709\u7814\u7a76\u63a2\u7d22\u4e86\u5176\u5b83\u4e00\u4e9b\u65e8\u5728\u6839\u636e\u4eba\u7c7b\u504f\u597d\uff08\u65e0\u9700\u4f9d\u8d56\u67d0\u4e2a\u6807\u91cf\u7684\u5956\u52b1\u4fe1\u53f7\uff09\u6765\u76f4\u63a5\u4f18\u5316 LLM 
\u7b56\u7565\u7684\u65b9\u6cd5\u3002<\/p>\n\n\n\n<p>\u8fd9\u4e9b\u65b9\u6cd5\u7684\u76ee\u6807\u662f\u901a\u8fc7\u66f4\u76f4\u63a5\u5730\u4f7f\u7528\u504f\u597d\u6570\u636e\u6765\u7b80\u5316\u5bf9\u9f50\u6d41\u7a0b\u3001\u964d\u4f4e\u8ba1\u7b97\u5f00\u9500\u4ee5\u53ca\u5b9e\u73b0\u66f4\u7a33\u5065\u7684\u4f18\u5316\u3002\u901a\u8fc7\u5c06\u8be5\u95ee\u9898\u63cf\u8ff0\u4e3a\u4e00\u4e2a\u504f\u597d\u4f18\u5316\u95ee\u9898\uff0c\u800c\u4e0d\u662f\u5956\u52b1\u4f30\u8ba1\u548c\u6700\u5927\u5316\u95ee\u9898\uff0c\u8fd9\u4e9b\u65b9\u6cd5\u80fd\u63d0\u4f9b\u4e00\u79cd\u5c06<mark>\u8bed\u8a00\u6a21\u578b<\/mark>\u4e0e\u4eba\u7c7b\u5224\u65ad\u5bf9\u9f50\u7684\u4e0d\u540c\u89c6\u89d2<\/p>\n\n\n\n<h3>4.<strong>token \u7ea7 DPO<\/strong><\/h3>\n\n\n\n<p>\u4f7f\u7528 DPO \u65f6\uff0c\u5956\u52b1\u4f1a\u88ab\u4e00\u8d77\u5206\u914d\u7ed9 prompt \u548c\u54cd\u5e94\u3002\u76f8\u53cd\uff0c\u4f7f\u7528 MDP \u65f6\uff0c\u5956\u52b1\u4f1a\u88ab\u5206\u914d\u7ed9\u5404\u4e2a\u52a8\u4f5c\u3002\u540e\u7eed\u7684\u4e24\u7bc7\u8bba\u6587\u5728 token \u5c42\u9762\u9610\u8ff0\u4e86 DPO \u5e76\u5c06\u5176\u5e94\u7528\u6269\u5c55\u5230\u4e86 token \u7ea7\u7684\u5206\u6790\u3002<\/p>\n\n\n\n<ul><li>DPO \u53ef\u4ee5\u6267\u884c token \u7ea7\u4fe1\u7528\u5206\u914d\u7684\u7814\u7a76\uff0c\u53c2\u9605\u8bba\u6587\u300aFrom r to Q\u2217: Your language model is secretly a Q-function\u300b\uff0c\u62a5\u9053<a href=\"http:\/\/mp.weixin.qq.com\/s?__biz=MzA3MzI4MjgzMw==&amp;mid=2650915526&amp;idx=2&amp;sn=1218e4612e6155527030f7ed7b61fcbe&amp;chksm=84e406b8b3938fae1381190a7bcef69b4f9e3bbf830235938bdf17de964394b7a6b8279d5f0f&amp;scene=21#wechat_redirect\" target=\"_blank\" rel=\"noreferrer noopener\">\u300a\u8fd9\u5c31\u662f OpenAI \u795e\u79d8\u7684 Q*\uff1f\u65af\u5766\u798f\uff1a<mark>\u8bed\u8a00\u6a21\u578b<\/mark>\u5c31\u662f Q \u51fd\u6570\u300b<\/a>\u3002<\/li><li>TDPO\uff0ctoken \u7ea7 DPO\uff0c\u53c2\u9605\u8bba\u6587\u300aToken-level direct preference 
optimization\u300b\u3002<\/li><\/ul>\n\n\n\n<h3>5.<strong>\u8fed\u4ee3\u5f0f \/ \u5728\u7ebf DPO<\/strong><\/h3>\n\n\n\n<p>\u4f7f\u7528 DPO \u65f6\uff0c\u4f1a\u4f7f\u7528\u6240\u6709\u53ef\u7528\u7684\u504f\u597d\u6570\u636e\u96c6\u6765\u5bf9\u9f50 LLM\u3002\u4e3a\u4e86\u6301\u7eed\u63d0\u5347 LLM\uff0c\u5e94\u5f53\u5b9e\u73b0\u8fed\u4ee3\u5f0f \/ \u5728\u7ebf DPO\u3002\u8fd9\u5c31\u5f15\u51fa\u4e86\u4e00\u4e2a\u6709\u8da3\u7684\u95ee\u9898\uff1a\u5982\u4f55\u9ad8\u6548\u5730\u6536\u96c6\u65b0\u7684\u504f\u597d\u6570\u636e\u96c6\u3002\u4e0b\u9762\u4e24\u7bc7\u8bba\u6587\u6df1\u5165\u63a2\u8ba8\u4e86\u8fd9\u4e00\u4e3b\u9898\u3002<\/p>\n\n\n\n<ul><li>\u81ea\u6211\u5956\u52b1\u5f0f<mark>\u8bed\u8a00\u6a21\u578b<\/mark>\uff0c\u53c2\u9605\u8bba\u6587\u300aSelf-rewarding language models\u300b\u3002<\/li><li>CRINGE\uff0c\u53c2\u9605\u8bba\u6587\u300aThe cringe loss: Learning what language not to model\u300b\u3002<\/li><\/ul>\n\n\n\n<h3>6.<strong>\u4e8c\u5143\u53cd\u9988<\/strong><\/h3>\n\n\n\n<p>\u4e8b\u5b9e\u8bc1\u660e\uff0c\u6536\u96c6\u504f\u597d\u53cd\u9988\u6bd4\u6536\u96c6\u4e8c\u5143\u53cd\u9988\uff08\u6bd4\u5982\u70b9\u8d5e\u6216\u70b9\u8e29\uff09\u7684\u96be\u5ea6\u5927\uff0c\u56e0\u6b64\u540e\u8005\u53ef\u4fc3\u8fdb\u5bf9\u9f50\u8fc7\u7a0b\u7684\u6269\u5c55\u3002KTO \u548c DRO \u8fd9\u4e24\u9879\u7814\u7a76\u5173\u6ce8\u7684\u4fbf\u662f\u4f7f\u7528\u4e8c\u5143\u53cd\u9988\u6765\u5bf9\u9f50 LLM\u3002<\/p>\n\n\n\n<ul><li>KTO\uff0cKahneman-Tversky \u4f18\u5316\uff0c\u53c2\u9605\u8bba\u6587\u300aKTO: Model alignment as prospect theoretic optimization\u300b\u3002<\/li><li>DRO\uff0c\u76f4\u63a5\u5956\u52b1\u4f18\u5316\uff0c\u53c2\u9605\u8bba\u6587\u300aOffline regularised reinforcement learning for large language models alignment\u300b\u3002<\/li><\/ul>\n\n\n\n<h3><strong>7.\u878d\u5408 SFT \u548c\u5bf9\u9f50<\/strong><\/h3>\n\n\n\n<p>\u4e4b\u524d\u7684\u7814\u7a76\u4e3b\u8981\u8fd8\u662f\u6309\u987a\u5e8f\u6267\u884c SFT 
\u548c\u5bf9\u9f50\uff0c\u4f46\u4e8b\u5b9e\u8bc1\u660e\u8fd9\u79cd\u65b9\u6cd5\u5f88\u8d39\u529b\uff0c\u5e76\u4f1a\u5bfc\u81f4\u707e\u96be\u6027\u9057\u5fd8\u3002\u540e\u7eed\u7684\u7814\u7a76\u6709\u4e24\u4e2a\u65b9\u5411\uff1a\u4e00\u662f\u5c06\u8fd9\u4e24\u4e2a\u8fc7\u7a0b\u6574\u5408\u6210\u5355\u4e00\u6b65\u9aa4\uff1b\u4e8c\u662f\u5e76\u884c\u5730\u5fae\u8c03\u4e24\u4e2a\u6a21\u578b\uff0c\u6700\u7ec8\u518d\u8fdb\u884c\u878d\u5408\u3002<\/p>\n\n\n\n<ul><li>ORPO\uff0c\u6bd4\u503c\u6bd4\u504f\u597d\u4f18\u5316\uff0c\u53c2\u9605\u8bba\u6587\u300aORPO: Monolithic preference optimization without reference model\u300b\u3002<\/li><li>PAFT\uff0c\u5e76\u884c\u5fae\u8c03\uff0c\u53c2\u9605\u8bba\u6587\u300aPAFT: A parallel training paradigm for effective llm fine-tuning\u300b\u3002<\/li><\/ul>\n\n\n\n<h3><strong>8.\u957f\u5ea6\u63a7\u5236\u5f0f DPO \u548c\u65e0\u53c2\u8003 DPO<\/strong><\/h3>\n\n\n\n<p>\u4e4b\u524d\u6709\u7814\u7a76\u8868\u660e\uff0cLLM \u7684\u8f93\u51fa\u5f80\u5f80\u8fc7\u4e8e\u5197\u957f\u3002\u4e3a\u4e86\u89e3\u51b3\u8fd9\u4e2a\u95ee\u9898\uff0cR-DPO \u548c SimPO \u7684\u5173\u6ce8\u91cd\u5fc3\u662f\u5728\u4e0d\u5f71\u54cd\u751f\u6210\u6027\u80fd\u7684\u524d\u63d0\u4e0b\u5b9e\u73b0\u5bf9\u54cd\u5e94\u957f\u5ea6\u7684\u63a7\u5236\u3002<\/p>\n\n\n\n<p>\u6b64\u5916\uff0cDPO \u5fc5\u9700\u53c2\u8003\u7b56\u7565\u6765\u786e\u4fdd\u5df2\u5bf9\u9f50\u6a21\u578b\u4e0d\u4f1a\u4e0e\u53c2\u8003\u6a21\u578b\u6709\u592a\u5927\u504f\u5dee\u3002\u76f8\u8f83\u4e4b\u4e0b\uff0cSimPO \u548c RLOO \u63d0\u51fa\u4e86\u4e00\u4e9b\u65b9\u6cd5\uff0c\u53ef\u4ee5\u5728\u4e0d\u5f71\u54cd LLM \u6548\u679c\u7684\u60c5\u51b5\u4e0b\u6d88\u9664\u5bf9\u53c2\u8003\u6a21\u578b\u7684\u9700\u6c42<\/p>\n\n\n\n<h3><strong>9.\u9010\u5217\u8868\u7684\u504f\u597d\u4f18\u5316<\/strong><\/h3>\n\n\n\n<p>\u4e4b\u524d\u5728 PPO \u548c DPO \u65b9\u9762\u7684\u7814\u7a76\u5173\u6ce8\u7684\u662f\u6210\u5bf9\u504f\u597d\uff0c\u800c RLHF 
\u65b9\u9762\u7684\u7814\u7a76\u5219\u662f\u6536\u96c6\u9010\u5217\u8868\u7684\u504f\u597d\u6765\u52a0\u901f\u6570\u636e\u6536\u96c6\u8fc7\u7a0b\uff0c\u4e4b\u540e\u518d\u5c06\u5b83\u4eec\u8f6c\u6362\u6210\u6210\u5bf9\u504f\u597d\u3002\u5c3d\u7ba1\u5982\u6b64\uff0c\u4e3a\u4e86\u63d0\u5347 LLM \u7684\u6027\u80fd\uff0c\u76f4\u63a5\u4f7f\u7528\u9010\u5217\u8868\u7684\u6570\u636e\u96c6\u6765\u6267\u884c\u504f\u597d\u4f18\u5316\u662f\u53ef\u884c\u7684\u3002\u4ee5\u4e0b\u4e09\u7bc7\u8bba\u6587\u4e13\u95e8\u8ba8\u8bba\u4e86\u8fd9\u79cd\u65b9\u6cd5\u3002<\/p>\n\n\n\n<ul><li>LiPO\uff0c\u9010\u5217\u8868\u504f\u597d\u4f18\u5316\uff0c\u53c2\u9605\u8bba\u6587\u300aLIPO: Listwise preference optimization through learning-to-rank\u300b\u3002<\/li><li>RRHF\uff0c\u53c2\u9605\u8bba\u6587\u300aRRHF: Rank responses to align language models with human feedback without tears\u300b\u3002<\/li><li>PRO\uff0c\u504f\u597d\u6392\u540d\u4f18\u5316\uff0c\u53c2\u9605\u8bba\u6587\u300aPreference ranking optimization for human alignment\u300b\u3002<\/li><\/ul>\n\n\n\n<h3><strong>10.\u8d1f\u504f\u597d\u4f18\u5316<\/strong><\/h3>\n\n\n\n<p>\u8fd9\u4e9b\u7814\u7a76\u6709\u4e00\u4e2a\u5171\u540c\u524d\u63d0\uff1a\u5f53\u524d\u8fd9\u4e00\u4ee3 LLM \u5df2\u7ecf\u5728\u7ffb\u8bd1\u548c\u603b\u7ed3\u7b49\u4efb\u52a1\u4e0a\u8d85\u8d8a\u4e86\u4eba\u7c7b\u6027\u80fd\u3002\u56e0\u6b64\uff0c\u53ef\u4ee5\u5c06 LLM \u7684\u8f93\u51fa\u89c6\u4e3a\u671f\u671b\u54cd\u5e94\uff0c\u800c\u65e0\u9700\u4f9d\u9760\u5c06\u4eba\u7c7b\u6807\u6ce8\u7684\u6570\u636e\u89c6\u4e3a\u504f\u597d\u54cd\u5e94\uff1b\u8fd9\u6837\u505a\u662f\u6709\u597d\u5904\u7684\u3002\u53cd\u8fc7\u6765\uff0c\u4e0d\u671f\u671b\u5f97\u5230\u7684\u54cd\u5e94\u4f9d\u7136\u4e5f\u53ef\u88ab\u7528\u4e8e\u5bf9\u9f50 LLM\uff0c\u8fd9\u4e2a\u8fc7\u7a0b\u5c31\u662f\u6240\u8c13\u7684\u8d1f\u504f\u597d\u4f18\u5316\uff08NPO\uff09\u3002<\/p>\n\n\n\n<ul><li>NN\uff0c\u5426\u5b9a\u8d1f\u4f8b\u65b9\u6cd5\uff0c\u53c2\u9605\u8bba\u6587\u300aNegating negatives: Alignment 
without human positive samples via distributional dispreference optimization\u300b\u3002<\/li><li>NPO\uff0c\u8d1f\u4f8b\u504f\u597d\u4f18\u5316\uff0c\u53c2\u9605\u8bba\u6587\u300aNegative preference optimization: From catastrophic collapse to effective unlearning\u300b\u3002<\/li><li>CPO\uff0c\u5bf9\u6bd4\u504f\u597d\u4f18\u5316\uff0c\u53c2\u9605\u8bba\u6587\u300aContrastive preference optimization: Pushing the boundaries of llm performance in machine translation\u300b\u3002<\/li><\/ul>\n\n\n\n<h3><strong>11.\u7eb3\u4ec0\u5b66\u4e60<\/strong><\/h3>\n\n\n\n<p>\u4e4b\u524d\u7684\u7814\u7a76\u901a\u5e38\u662f\u4f7f\u7528\u9010\u70b9\u5956\u52b1\u548c BT \u6a21\u578b\u6765\u5f97\u5230\u6210\u5bf9\u504f\u597d\u3002\u4f46\u662f\uff0c\u8fd9\u79cd\u65b9\u6cd5\u6bd4\u4e0d\u4e0a\u76f4\u63a5\u6210\u5bf9\u504f\u597d\u5efa\u6a21\u5e76\u4e14\u65e0\u6cd5\u89e3\u51b3\u6210\u5bf9\u504f\u597d\u4e2d\u7684\u4e0d\u4e00\u81f4\u95ee\u9898\u3002\u4e3a\u4e86\u514b\u670d\u8fd9\u4e9b\u5c40\u9650\uff0c\u4e00\u4e9b\u7814\u7a76\u63d0\u51fa\u4e86\u7eb3\u4ec0\u5b66\u4e60\u65b9\u6cd5\u3002<\/p>\n\n\n\n<ul><li>\u6839\u636e\u4eba\u7c7b\u53cd\u9988\u7684\u7eb3\u4ec0\u5b66\u4e60\uff0c\u53c2\u9605\u8bba\u6587\u300aNash learning from human feedback\u300b\u3002<\/li><li>SPPO\uff0c\u81ea\u535a\u5f08\u504f\u597d\u4f18\u5316\uff0c\u53c2\u9605\u8bba\u6587\u300aA minimaximalist approach to reinforcement learning from human feedback\u300b\u3002<\/li><li>DNO\uff0c\u76f4\u63a5\u7eb3\u4ec0\u4f18\u5316\uff0c\u53c2\u9605\u8bba\u6587\u300aDirect nash optimization: Teaching language models to self-improve with general preferences\u300b\u3002<\/li><\/ul>\n\n\n\n<h1>LLM \u5bf9\u9f50\uff08Alignment\uff09\u65b9\u6cd5\uff1aSFT\u3001PPO\u3001DPO\u3001ORPO\u3001GRPO\u7b49\u65b9\u6cd5\u8be6\u7ec6\u4ecb\u7ecd<\/h1>\n\n\n\n<figure class=\"wp-block-image size-large\"><img loading=\"lazy\" width=\"1024\" height=\"448\" 
src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/v2-79de98bcff7bda5492f1c136aa5fee5a_r-1024x448.jpg\" alt=\"\" class=\"wp-image-24412\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/v2-79de98bcff7bda5492f1c136aa5fee5a_r-1024x448.jpg 1024w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/v2-79de98bcff7bda5492f1c136aa5fee5a_r-300x131.jpg 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/v2-79de98bcff7bda5492f1c136aa5fee5a_r-768x336.jpg 768w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/v2-79de98bcff7bda5492f1c136aa5fee5a_r.jpg 1507w\" sizes=\"(max-width: 1024px) 100vw, 1024px\" \/><\/figure>\n\n\n\n<p>LLM\uff08\u5927\u8bed\u8a00\u6a21\u578b\uff09\u7684\u5bf9\u9f50\uff08Alignment\uff09\u65b9\u6cd5\u65e8\u5728<strong>\u8ba9 AI \u7684\u8f93\u51fa\u66f4\u52a0\u7b26\u5408\u4eba\u7c7b\u9884\u671f<\/strong>\uff0c\u51cf\u5c11\u9519\u8bef\u4fe1\u606f\u3001\u6709\u5bb3\u5185\u5bb9\u6216\u4e0d\u51c6\u786e\u7684\u56de\u7b54\u3002<strong>\u4e3b\u8981\u603b\u7ed3LLM\u8bad\u7ec3\u4e2d\u7684\u57fa\u672c\u7684\u5bf9\u9f50\u7b97\u6cd5\uff0c<em>\u5373<\/em> \u76d1\u7763\u5fae\u8c03 \uff08SFT\uff09\u3001\u76f4\u63a5\u504f\u597d\u4f18\u5316 \uff08DPO\uff09 \u548c\u8fd1\u7aef\u7b56\u7565\u4f18\u5316 \uff08PPO\uff09\u7b49\u3002<\/strong><\/p>\n\n\n\n<h2>SFT\uff08Supervised Fine-Tuning\uff0c\u76d1\u7763\u5fae\u8c03\uff09<\/h2>\n\n\n\n<p>\u76d1\u7763\u5fae\u8c03\uff08SFT\uff09\u662f LLM \u8bad\u7ec3\u4e2d\u7684<strong>\u7b2c\u4e00\u6b65<\/strong>\uff0c\u901a\u8fc7\u9ad8\u8d28\u91cf\u7684\u4eba\u5de5\u6807\u6ce8\u6570\u636e\u96c6\u5bf9\u6a21\u578b\u8fdb\u884c\u5fae\u8c03\uff0c\u4f7f\u5176\u5177\u5907\u57fa\u7840\u7684\u4efb\u52a1\u80fd\u529b\u3002SFT \u662f<strong>\u6240\u6709\u5bf9\u9f50\u65b9\u6cd5\u7684\u57fa\u7840<\/strong>\uff0c\u5982 RLHF\u3001DPO \u7b49\u90fd\u4f9d\u8d56\u4e8e\u4e00\u4e2a\u7ecf\u8fc7 SFT 
\u8bad\u7ec3\u7684\u6a21\u578b\u4f5c\u4e3a\u521d\u59cb\u72b6\u6001\u3002<\/p>\n\n\n\n<p><strong>\u8fc7\u7a0b<\/strong>\uff1a<\/p>\n\n\n\n<ol><li><strong>\u6570\u636e\u51c6\u5907<\/strong>\uff1a\u6536\u96c6\u9ad8\u8d28\u91cf\u7684\u6307\u4ee4-\u54cd\u5e94\uff08Instruction-Response\uff09\u6570\u636e\u96c6\uff0c\u4f8b\u5982\u4eba\u7c7b\u6807\u6ce8\u7684\u6570\u636e\u6216\u5408\u6210\u7684\u6570\u636e\u3002<\/li><li><strong>\u6a21\u578b\u5fae\u8c03<\/strong>\uff1a\u4f7f\u7528\u4ea4\u53c9\u71b5\u635f\u5931\uff08Cross-Entropy Loss\uff09\u8bad\u7ec3\u6a21\u578b\uff0c\u4f7f\u5176\u5b66\u4e60\u63d0\u4f9b\u4e0e\u6807\u6ce8\u6570\u636e\u5339\u914d\u7684\u7b54\u6848\u3002<\/li><li><strong>\u6548\u679c<\/strong>\uff1a\u4f7f\u6a21\u578b\u5728\u5e38\u89c1\u4efb\u52a1\uff08\u5982\u95ee\u7b54\u3001\u4ee3\u7801\u751f\u6210\u3001\u5bf9\u8bdd\u7b49\uff09\u4e2d\u8868\u73b0\u66f4\u597d\uff0c\u63d0\u9ad8\u5176\u5bf9\u6307\u4ee4\u7684\u9075\u5faa\u80fd\u529b\u3002<\/li><\/ol>\n\n\n\n<p>\u7ed9\u5b9a\u8f93\u5165 x\uff08Prompt\uff09 \u548c\u76ee\u6807\u8f93\u51fa y\uff08Response\uff09\uff0c\u6a21\u578b\u7684\u76ee\u6807\u662f\u6700\u5927\u5316\u751f\u6210\u76ee\u6807\u6587\u672c\u7684\u6982\u7387\uff1a<\/p>\n\n\n\n<div class=\"wp-block-image\"><figure class=\"aligncenter size-full is-resized\"><img loading=\"lazy\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-2.png\" alt=\"\" class=\"wp-image-24367\" width=\"222\" height=\"45\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-2.png 375w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-2-300x61.png 300w\" sizes=\"(max-width: 222px) 100vw, 222px\" \/><\/figure><\/div>\n\n\n\n<p>\u5176\u4e2d\uff1a<\/p>\n\n\n\n<ul><li>P\u03b8\u200b(yt\u200b\u2223x,y&lt;t\u200b) \u662f LLM \u5728\u7ed9\u5b9a\u4e0a\u4e0b\u6587\u4e0b\u9884\u6d4b\u4e0b\u4e00\u4e2a token yt\u200b \u7684\u6982\u7387\u3002<\/li><li>\u8bad\u7ec3\u65f6\u91c7\u7528\u4ea4\u53c9\u71b5\u635f\u5931\uff08Cross Entropy 
Loss\uff09\u6765\u4f18\u5316\u6a21\u578b\u53c2\u6570 \u03b8\u3002<\/li><\/ul>\n\n\n\n<p>SFT \u4ec5\u4f9d\u8d56\u4e8e\u4eba\u5de5\u6807\u6ce8\u6570\u636e\uff0c\u65e0\u6cd5\u8ba9\u6a21\u578b\u5b66\u4e60<strong>\u504f\u597d\u4fe1\u606f<\/strong>\uff08\u6bd4\u5982\u4e0d\u540c\u56de\u7b54\u7684\u4f18\u52a3\uff09\u3002<strong>\u65e0\u6cd5\u52a8\u6001\u8c03\u6574<\/strong>\uff1aSFT \u8bad\u7ec3\u540e\uff0c\u6a21\u578b\u56fa\u5b9a\uff0c\u96be\u4ee5\u9488\u5bf9\u7528\u6237\u53cd\u9988\u8fdb\u884c\u8c03\u6574\u3002<strong>\u7f3a\u4e4f\u63a2\u7d22\u6027<\/strong>\uff1a\u6a21\u578b\u53ea\u4f1a\u5b66\u5230\u8bad\u7ec3\u6570\u636e\u4e2d\u7684\u6a21\u5f0f\uff0c\u65e0\u6cd5\u8fdb\u884c\u5f3a\u5316\u5b66\u4e60\u4f18\u5316\u3002<\/p>\n\n\n\n<h2>DPO\uff08Direct Preference Optimization\uff0c\u76f4\u63a5\u504f\u597d\u4f18\u5316\uff09<\/h2>\n\n\n\n<p>\u8bba\u6587\uff1a<a href=\"https:\/\/arxiv.org\/abs\/2305.18290\"><strong>https:\/\/arxiv.org\/abs\/2305.18290<\/strong><\/a><\/p>\n\n\n\n<p>\u53c2\u8003\u4ee3\u7801\uff1ahttps:\/\/github.com\/eric-mitchell\/direct-preference-optimization<\/p>\n\n\n\n<p>DPO\uff08\u76f4\u63a5\u504f\u597d\u4f18\u5316\uff09\u662f\u4e00\u79cd<strong>\u6bd4 RLHF \u66f4\u7b80\u5355\u3001\u66f4\u9ad8\u6548<\/strong>\u7684\u5bf9\u9f50\u65b9\u6cd5\u3002<br>\u5b83\u4e0d\u9700\u8981\u8bad\u7ec3\u5956\u52b1\u6a21\u578b\uff08RM\uff09\u548c\u4f7f\u7528\u5f3a\u5316\u5b66\u4e60\uff08RL\uff09\uff0c\u800c\u662f<strong>\u76f4\u63a5\u4f18\u5316 LLM\uff0c\u4f7f\u5176\u66f4\u7b26\u5408\u4eba\u7c7b\u504f\u597d\u6570\u636e<\/strong>\u3002<\/p>\n\n\n\n<p><strong>\u504f\u597d\u6570\u636e<\/strong>\uff1a<\/p>\n\n\n\n<ul><li>\u6bcf\u4e2a\u8f93\u5165 Prompt \u5bf9\u5e94<strong>\u4e24\u4e2a\u5019\u9009\u56de\u7b54<\/strong>\uff1a\u4e00\u4e2a<strong>\u4f18\u9009<\/strong>\uff08Preferred y+\uff09\uff0c\u4e00\u4e2a<strong>\u52a3\u9009<\/strong>\uff08Dispreferred y\u2212\uff09\u3002<\/li><li>\u4f8b\u5982\uff1a<\/li><\/ul>\n\n\n\n<p>Prompt: 
&#8220;\u5982\u4f55\u5199\u4e00\u5c01\u6b63\u5f0f\u7684\u7535\u5b50\u90ae\u4ef6\uff1f&#8221;<br>Response 1 (\u4f18\u9009): &#8220;\u5728\u90ae\u4ef6\u4e2d\u5e94\u4fdd\u6301\u6b63\u5f0f\u8bed\u6c14\uff0c\u5e76\u5305\u542b\u79f0\u547c\u3001\u6b63\u6587\u548c\u7f72\u540d\u3002&#8221;<br>Response 2 (\u52a3\u9009): &#8220;\u968f\u4fbf\u5199\u5c31\u884c\u4e86\uff0c\u4e0d\u8981\u592a\u5728\u610f\u683c\u5f0f\u3002&#8221;<\/p>\n\n\n\n<p><strong>\u4f18\u5316 LLM \u4f7f\u5176\u66f4\u503e\u5411\u4e8e\u4f18\u9009\u56de\u7b54<\/strong>\u3002<\/p>\n\n\n\n<p><strong>\u53ea\u9700\u8981\u52a0\u8f7d2\u4e2a\u76f8\u540c\u7684\u6a21\u578b\uff0c\u5176\u4e2d\u4e00\u4e2a\u63a8\u7406[<\/strong><span style=\"background-color: rgba(51, 51, 51, 0.2); font-weight: 700; font-size: revert;\">reference model\uff1a<\/span><strong style=\"background-color: rgba(51, 51, 51, 0.2); font-size: revert;\">old\u7b56\u7565\u6a21\u578b]\uff0c\u53e6\u5916\u4e00\u4e2a\u6a21\u578b<span style=\"font-size: revert;\">[<\/span><\/strong><span style=\"background-color: rgba(51, 51, 51, 0.2); font-size: revert; font-weight: 700;\">policy model \u7b56\u7565\u6a21\u578b<\/span><strong style=\"background-color: rgba(51, 51, 51, 0.2); font-size: revert;\">]\u8bad\u7ec3\uff0c\u76f4\u63a5\u5728\u504f\u597d\u6570\u636e\u4e0a\u8fdb\u884c\u8bad\u7ec3\u5373\u53ef:<\/strong><\/p>\n\n\n\n<p><strong>Reference Model\uff08\u4ee5\u4e0b\u7b80\u79f0Ref\u6a21\u578b\uff09\u4e00\u822c\u4e5f\u7528SFT\u9636\u6bb5\u5f97\u5230\u7684SFT\u6a21\u578b\u505a\u521d\u59cb\u5316\uff0c\u5728\u8bad\u7ec3\u8fc7\u7a0b\u4e2d\uff0c\u5b83\u7684\u53c2\u6570\u662f\u51bb\u7ed3\u7684\u3002Ref\u6a21\u578b\u7684\u4e3b\u8981\u4f5c\u7528\u662f\u9632\u6b62Actor\u201c\u8bad\u6b6a\u201d\u3002<\/strong><\/p>\n\n\n\n<p><strong>\u635f\u5931\u51fd\u6570<\/strong>\uff1a DPO \u76f4\u63a5\u4f18\u5316\u6a21\u578b\u8f93\u51fa\u7684\u504f\u597d\u5206\u5e03\uff1a<\/p>\n\n\n\n<figure class=\"wp-block-image size-full is-resized\"><img loading=\"lazy\" 
src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-50.png\" alt=\"\" class=\"wp-image-24815\" width=\"361\" height=\"52\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-50.png 607w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-50-300x43.png 300w\" sizes=\"(max-width: 361px) 100vw, 361px\" \/><\/figure>\n\n\n\n<figure class=\"wp-block-image size-large\"><img loading=\"lazy\" width=\"1024\" height=\"83\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-49-1024x83.png\" alt=\"\" class=\"wp-image-24813\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-49-1024x83.png 1024w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-49-300x24.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-49-768x62.png 768w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-49.png 1084w\" sizes=\"(max-width: 1024px) 100vw, 1024px\" \/><\/figure>\n\n\n\n<p>\u5176\u4e2d\uff1a<\/p>\n\n\n\n<ul><li><em>\u03c3<\/em>&nbsp;\uff1asigmoid\u51fd\u6570<\/li><li><em>\u03b2<\/em>&nbsp;\uff1a\u8d85\u53c2\u6570\uff0c\u4e00\u822c\u57280.1 &#8211; 0.5\u4e4b\u95f4<\/li><li><em>yw<\/em>&nbsp;:\u67d0\u6761\u504f\u597d\u6570\u636e\u4e2d\u597d\u7684response\uff0cw\u5c31\u662fwin\u7684\u610f\u601d<\/li><li><em>yl<\/em>&nbsp;:\u67d0\u6761\u504f\u597d\u6570\u636e\u4e2d\u5dee\u7684response\uff0cl\u5c31\u662floss\u7684\u610f\u601d\uff0c\u6240\u4ee5\u504f\u597d\u6570\u636e\u4e5f\u53ebcomparision data<\/li><li><em>\u03c0<sub>\u03b8<\/sub>(yw|x)&nbsp;<\/em>:\u7ed9\u5b9a\u8f93\u5165x, \u5f53\u524d \u7b56\u7565policy model\u751f\u6210\u597d\u7684response\u7684\u7d2f\u79ef\u6982\u7387(\u6bcf\u4e2atokne\u7684\u6982\u7387\u6c42\u548c\uff0c\u5177\u4f53\u770b\u4ee3\u7801)<\/li><li><em>\u03c0<sub>ref<\/sub>(yl|x)<\/em>&nbsp;:\u7ed9\u5b9a\u8f93\u5165x, <strong>\u539f\u59cb\u6a21\u578b(reference 
model)\u751f\u6210\u574f\u7684response\u7684\u7d2f\u79ef\u6982\u7387<\/strong><\/li><\/ul>\n\n\n\n<p>\u5f00\u59cb\u8bad\u7ec3\u65f6\uff0creference model\u548cpolicy model\u90fd\u662f\u540c\u4e00\u4e2a\u6a21\u578b\uff0c\u53ea\u4e0d\u8fc7\u5728\u8bad\u7ec3\u8fc7\u7a0b\u4e2dreference model\u4e0d\u4f1a\u66f4\u65b0\u6743\u91cd\u3002<\/p>\n\n\n\n<p>\u4e3a\u4e86\u65b9\u4fbf\u5206\u6790\uff0c\u6211\u4eec\u628alog\u91cc\u7684\u5206\u5f0f\u5c55\u5f00\uff0c\u7136\u540e\u03b2\u8bbe\u4e3a1\uff0c\u5e76\u4e14\u6682\u65f6\u4e0d\u770b\u524d\u9762\u7684log_sigmoid\uff0c\u90a3\u4e48\u4e0a\u9762\u7684loss\u53ef\u4ee5\u7b80\u5316\u4e3a\uff1a<\/p>\n\n\n\n<figure class=\"wp-block-image size-full is-resized\"><img loading=\"lazy\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-51.png\" alt=\"\" class=\"wp-image-24836\" width=\"472\" height=\"38\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-51.png 706w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-51-300x25.png 300w\" sizes=\"(max-width: 472px) 100vw, 472px\" \/><\/figure>\n\n\n\n<p>\u7531\u4e8e\u6700\u521dloss\u524d\u9762\u662f\u6709\u4e2a\u8d1f\u53f7\u7684\uff0c\u6240\u4ee5\u4f18\u5316\u76ee\u6807\u662f\u8ba9\u672c\u7b80\u5316\u516c\u5f0f\u6700\u5927\uff0c\u5373\u6211\u4eec\u5e0c\u671b\u5de6\u534a\u90e8\u5206\u548c\u53f3\u534a\u90e8\u5206\u7684margin\u8d8a\u5927\u8d8a\u597d\uff0c\u5de6\u534a\u90e8\u5206\u7684\u542b\u4e49\u662fgood response\u76f8\u8f83\u4e8e\u6ca1\u8bad\u7ec3\u4e4b\u524d\u7684\u7d2f\u79ef\u6982\u7387\u5dee\u503c\uff0c\u53f3\u534a\u90e8\u5206\u4ee3\u8868bad response\u76f8\u8f83\u4e8e\u6ca1\u8bad\u7ec3\u4e4b\u524d\u7684\u7d2f\u8ba1\u6982\u7387\u5dee\u503c\uff0c\u5982\u679c\u8fd9\u4e2a\u5dee\u503c\uff0c\u5373margin\u53d8\u5927\u4e86\uff0c\u5c31\u610f\u5473\u7740\uff1a<\/p>\n\n\n\n<ul><li>1\uff09\u5de6\u8fb9\u53d8\u5927\uff0c\u53f3\u8fb9\u53d8\u5c0f\uff0c\u7406\u60f3\u60c5\u51b5\uff0cgood response\u6982\u7387\u63d0\u5347\uff0cbad 
response\u6982\u7387\u4e0b\u964d<\/li><li>2\uff09\u5de6\u8fb9\u53d8\u5c0f\uff0c\u53f3\u8fb9\u66f4\u5c0f\uff0cgood response\u6982\u7387\u4e0b\u964d\uff0c\u4f46\u662fbad response\u6982\u7387\u4e0b\u964d\u7684\u66f4\u591a\uff0c\u751f\u6210\u7684\u65f6\u5019\u8fd8\u662f\u503e\u5411\u4e8egood response<\/li><li>3\uff09\u5de6\u8fb9\u53d8\u7684\u66f4\u5927\uff0c\u53f3\u8fb9\u53ea\u5927\u4e86\u4e00\u70b9\u70b9\uff0c\u548c2\uff09\u540c\u7406<\/li><\/ul>\n\n\n\n<p>\u6240\u4ee5\u8fd9\u4e2aloss\u9887\u6709\u4e00\u79cd<strong>\u5bf9\u6bd4<\/strong>\u7684\u611f\u89c9\u3002<\/p>\n\n\n\n<h2>OPA-DPO\uff1a\u591a\u6a21\u6001\u5927\u6a21\u578b\u5e7b\u89c9\u96be\u9898\u7684\u9ad8\u6548\u89e3\u51b3\u65b9\u6848<\/h2>\n\n\n\n<ul><li><strong><em>\u8bba\u6587\uff1a<a rel=\"noreferrer noopener\" href=\"https:\/\/arxiv.org\/pdf\/2501.09695\" target=\"_blank\">https:\/\/arxiv.org\/pdf\/2501.09695<\/a><\/em><\/strong><\/li><li><strong><em>\u4ee3\u7801\uff1a<a rel=\"noreferrer noopener\" href=\"https:\/\/github.com\/zhyang2226\/OPA-DPO\" target=\"_blank\">https:\/\/github.com\/zhyang2226\/OPA-DPO<\/a><\/em><\/strong><\/li><li><em>https:\/\/www.microsoft.com\/en-us\/research\/articles\/opa-dpo\/<\/em><\/li><\/ul>\n\n\n\n<p>\u5728\u89c6\u89c9\u591a\u6a21\u6001\u5927\u8bed\u8a00\u6a21\u578b\u9886\u57df\uff0c\u751f\u6210\u4e0e\u8f93\u5165\u56fe\u50cf\u4e0d\u4e00\u81f4\u751a\u81f3\u8fd8\u6709\u865a\u5047\u5185\u5bb9\u7684\u201c\u5e7b\u89c9\u201d\u73b0\u8c61\uff0c\u662f\u4e00\u4e2a\u4e9f\u5f85\u653b\u514b\u7684\u6838\u5fc3\u96be\u9898\u3002\u4f5c\u4e3a\u4e00\u79cd\u7b80\u5355\u6709\u6548\u7684\u89e3\u51b3\u65b9\u6848\uff0c\u76f4\u63a5\u504f\u597d\u4f18\u5316 (DPO) [1] \u6b63\u5728\u5f15\u8d77\u8d8a\u6765\u8d8a\u591a\u7684\u5173\u6ce8\u3002\u7814\u7a76\u8005\u4eec\u901a\u8fc7\u6bd4\u8f83\u6a21\u578b\u5728\u76f8\u540c\u63d0\u793a\u8bcd\u548c\u56fe\u50cf\u4e0b\u7684\u4e0d\u540c\u54cd\u5e94\uff0c\u6839\u636e\u5e7b\u89c9\u7a0b\u5ea6\u76f4\u63a5\u6784\u9020\u504f\u597d\u6570\u636e\u5bf9\uff0c\u7528\u4e8e DPO 
\u8bad\u7ec3\u3002<\/p>\n\n\n\n<p>\u7136\u800c\uff0c\u5fae\u8f6f\u4e9a\u6d32\u7814\u7a76\u9662\u7684\u7814\u7a76\u5458\u4eec\u6ce8\u610f\u5230\uff0c\u73b0\u6709\u7814\u7a76\u4e2d\u4e0d\u540c\u7684\u6570\u636e\u6784\u9020\u65b9\u6cd5\u4f1a\u5bfc\u81f4\u663e\u8457\u7684\u6027\u80fd\u5dee\u5f02\u3002\u56e0\u6b64\uff0c\u4ed6\u4eec\u5bf9\u201c\u57fa\u4e8e DPO \u89e3\u51b3\u591a\u6a21\u6001\u5927\u6a21\u578b\u5e7b\u89c9\u95ee\u9898\u201d\u7684\u7b97\u6cd5\u8fdb\u884c\u4e86\u5168\u9762\u5206\u6790\uff0c\u603b\u7ed3\u4e86\u5b83\u4eec\u7684\u8868\u73b0\u53ca\u5c40\u9650\u6027\uff0c\u540c\u65f6\u4ece\u7406\u8bba\u89d2\u5ea6\u63ed\u793a\u4e86\u5404\u7b97\u6cd5\u6027\u80fd\u5dee\u5f02\u80cc\u540e\u7684\u6839\u672c\u539f\u56e0\uff0c\u5e76\u6307\u51fa\u51b3\u5b9a\u6a21\u578b\u6027\u80fd\u7684\u6700\u5173\u952e\u56e0\u7d20\u662f\u201c\u7528\u4e8e\u6784\u5efa\u504f\u597d\u5bf9\u7684\u6570\u636e\uff0c\u76f8\u8f83\u4e8e DPO \u5f00\u59cb\u524d\u7684\u7b56\u7565\uff08reference policy\uff09\u662f\u5426\u4e3a\u540c\u7b56\u7565\uff08on-policy\uff09\u201d\u3002<\/p>\n\n\n\n<p>\u7814\u7a76\u5458\u4eec\u5c06\u6b64\u524d\u7684\u7814\u7a76\u5de5\u4f5c\u5206\u4e3a\u4e09\u7c7b\uff1a<\/p>\n\n\n\n<p>\u7b2c\u4e00\u7c7b\u662f\u5e7b\u89c9\u6ce8\u5165\u7c7b\uff0c\u5982 HALVA [2]\u548c POVID [3]\uff0c\u901a\u8fc7\u5728\u5df2\u6709\u56fe\u50cf\u548c\u63d0\u793a\u7684\u6807\u51c6\u54cd\u5e94\u4e2d\u4eba\u4e3a\u6ce8\u5165\u5e7b\u89c9\u7247\u6bb5\u6765\u6784\u5efa\u504f\u597d\u5bf9\uff1b<\/p>\n\n\n\n<p>\u7b2c\u4e8c\u7c7b\u662f\u5e7b\u89c9\u8bc6\u522b\u7c7b\uff0c\u5982 RLHF-V [4]\u3001HA-DPO [5]\u548c HSA-DPO [6]\uff0c\u5148\u8ba9\u6a21\u578b\u6839\u636e\u56fe\u50cf\u548c\u63d0\u793a\u81ea\u884c\u751f\u6210\u54cd\u5e94\uff0c\u7136\u540e\u5229\u7528\u4e13\u5bb6\u53cd\u9988\uff08\u4eba\u7c7b\u6216 
GPT-4\/4v\uff09\u6765\u8bc6\u522b\u548c\u4fee\u6539\u5176\u4e2d\u7684\u5e7b\u89c9\uff0c\u4ece\u800c\u6784\u5efa\u504f\u597d\u5bf9\uff1b<\/p>\n\n\n\n<p>\u7b2c\u4e09\u7c7b\u662f\u81ea\u6211\u8fdb\u5316\u7c7b\uff0c\u5982 RLAIF-V[7]\uff0c\u8ba9\u6a21\u578b\u9488\u5bf9\u540c\u4e00\u56fe\u50cf\u548c\u63d0\u793a\u751f\u6210\u591a\u4e2a\u54cd\u5e94\uff0c\u5e76\u7531\u4e00\u4e2a\u5728\u5e7b\u89c9\u8bc6\u522b\u65b9\u9762\u80fd\u529b\u66f4\u5f3a\u7684\u5bfc\u5e08\u6a21\u578b\u5bf9\u8fd9\u4e9b\u54cd\u5e94\u4e2d\u7684\u5e7b\u89c9\u4e25\u91cd\u7a0b\u5ea6\u8fdb\u884c\u5224\u65ad\u548c\u6392\u5e8f\uff0c\u4ee5\u6b64\u6784\u5efa\u504f\u597d\u5bf9\u3002<\/p>\n\n\n\n<figure class=\"wp-block-image size-large\"><img loading=\"lazy\" width=\"1024\" height=\"500\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/06\/image-43-1024x500.png\" alt=\"\" class=\"wp-image-26967\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/06\/image-43-1024x500.png 1024w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/06\/image-43-300x146.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/06\/image-43-768x375.png 768w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/06\/image-43.png 1444w\" sizes=\"(max-width: 1024px) 100vw, 1024px\" \/><\/figure>\n\n\n\n<p>\u6839\u636e\u5b9e\u9a8c\u7ed3\u679c\uff0c\u8fd9\u4e09\u7c7b\u7b97\u6cd5\u7684\u6027\u80fd\u603b\u7ed3\u4e3a\uff1a\u81ea\u6211\u8fdb\u5316\u7c7b &gt; \u5e7b\u89c9\u8bc6\u522b\u7c7b &gt; \u5e7b\u89c9\u6ce8\u5165\u7c7b\u3002<\/p>\n\n\n\n<p>\u5bf9\u4e8e\u5e7b\u89c9\u6ce8\u5165\u7c7b\uff0c\u5e7b\u89c9\u901a\u5e38\u5e76\u4e0d\u6765\u81ea\u6a21\u578b\u672c\u8eab\uff0c\u56e0\u6b64\u901a\u8fc7 DPO 
\u8bad\u7ec3\u5f80\u5f80\u4e0d\u80fd\u7ed9\u6a21\u578b\u5e26\u6765\u5f88\u5927\u589e\u76ca\u3002\u5bf9\u4e8e\u81ea\u6211\u8fdb\u5316\u7c7b\uff0c\u7406\u8bba\u4e0a\u7531\u4e8e\u7ef4\u5ea6\u707e\u96be\u95ee\u9898\uff0c\u8ba9\u6a21\u578b\u81ea\u884c\u63a2\u7d22\u5e76\u627e\u5230\u5b8c\u5168\u6b63\u786e\u7684\u56de\u590d\u662f\u5341\u5206\u56f0\u96be\u7684\uff0c\u6240\u4ee5\u90a3\u4e9b\u5b58\u5728\u4e8e\u591a\u4e2a\u56de\u590d\u4e2d\u7684\u987d\u56fa\u5e7b\u89c9\u901a\u5e38\u65e0\u6cd5\u901a\u8fc7\u8fd9\u79cd\u65b9\u6cd5\u6d88\u9664\u3002<\/p>\n\n\n\n<p>\u76f4\u89c9\u4e0a\uff0c\u5e7b\u89c9\u8bc6\u522b\u7c7b\u7684\u65b9\u6cd5\u5e94\u8be5\u662f\u6700\u9ad8\u6548\u7684\u89e3\u51b3\u5e7b\u89c9\u7684\u65b9\u6848\uff0c\u90a3\u4e3a\u4ec0\u4e48\u5728\u5b9e\u8df5\u4e2d\u8fd9\u7c7b\u65b9\u6cd5\u5374\u8d25\u4e0b\u9635\u6765\uff1f\u4e3a\u4e86\u4e86\u89e3\u80cc\u540e\u7684\u539f\u56e0\uff0c\u7814\u7a76\u5458\u4eec\u4ece DPO \u7b97\u6cd5\u7684\u7ec6\u8282\u5165\u624b\u8fdb\u884c\u7814\u7a76\u3002<\/p>\n\n\n\n<p>\u4e0e\u6700\u5e38\u7528\u7684 RLHF \u7b97\u6cd5 PPO \u7684\u521d\u59cb\u76ee\u6807\u76f8\u540c\uff0cDPO \u7684\u521d\u59cb\u76ee\u6807\u4e5f\u662f\uff08\u03c0_\u03b8 \u662f\u6a21\u578b\u7684\u5f53\u524d\u7b56\u7565\uff0c\u03c0_ref \u662f\u6a21\u578b\u7684\u521d\u59cb\u7b56\u7565\/\u53c2\u8003\u7b56\u7565\uff0cx \u4e3a\u63d0\u793a\u8bcd\uff0cm \u4e3a\u56fe\u50cf\uff0cy \u4e3a\u54cd\u5e94\uff0cr(x,y,m) \u662f\u901a\u8fc7 Bradley-Terry model \u8bad\u7ec3\u5f97\u5230\u7684\u5956\u52b1\u51fd\u6570\uff09\uff1a<\/p>\n\n\n\n<figure class=\"wp-block-image is-resized\"><img loading=\"lazy\" src=\"https:\/\/www.microsoft.com\/en-us\/research\/wp-content\/uploads\/2025\/06\/opa-dpo-2.png\" alt=\"formular\" class=\"wp-image-1141042\" width=\"522\" height=\"55\"\/><\/figure>\n\n\n\n<p>\u5373\u5728\u6700\u5927\u5316\u5956\u52b1\u7684\u540c\u65f6\uff0c\u7ea6\u675f\u6a21\u578b\u5f53\u524d\u7b56\u7565\u4e0e\u6a21\u578b\u521d\u59cb\u7b56\u7565\u4e4b\u95f4\u7684 KL 
\u6563\u5ea6\u3002\u7136\u800c\uff0c\u7814\u7a76\u5458\u4eec\u91cd\u65b0\u5ba1\u89c6 KL \u6563\u5ea6\u7684\u5b9a\u4e49\u53d1\u73b0\uff0c\u7ed9\u5b9a\u4efb\u4f55\u4e00\u4e2a\u63d0\u793a\u8bcd\u548c\u56fe\u50cf (x,m)\uff0c\u82e5\u5b58\u5728\u4e00\u4e2a\u54cd\u5e94 (y) \u4f7f\u5f97 \u03c0_\u03b8(y|x,m)&gt;0\uff0c\u4f46 \u03c0_ref(y|x,m)\u21920\uff0c\u6b64\u65f6 KL \u6563\u5ea6\u4f1a\u8d8b\u4e8e\u65e0\u7a77\u5927\u3002\u8fd9\u4e2a\u6027\u8d28\u8bf4\u660e\u2014\u2014\u5bf9\u4e8e\u4efb\u4f55\u4ece\u76ee\u6807\u51fd\u6570 (1) \u51fa\u53d1\u7684\u7b97\u6cd5\uff0c\u90a3\u4e9b\u76f8\u5bf9\u539f\u59cb\u7b56\u7565 (\u03c0_ref) \u91c7\u6837\u6982\u7387\u6781\u4f4e\u7684\u54cd\u5e94\uff08\u6839\u636e\u5f3a\u5316\u5b66\u4e60\u7684\u547d\u540d\u89c4\u8303\uff0c\u8fd9\u79cd\u6570\u636e\u88ab\u79f0\u4e3a\u5f02\u7b56\u7565\uff08off-policy\uff09\u6570\u636e\uff0c\u76f8\u53cd\u5219\u4e3a\u540c\u7b56\u7565\uff08on-policy\uff09\u6570\u636e\uff09\u5c06\u6ca1\u6709\u4efb\u4f55\u673a\u4f1a\u88ab\u6a21\u578b\u5b66\u4f1a\u3002<\/p>\n\n\n\n<figure class=\"wp-block-image is-resized\"><img loading=\"lazy\" src=\"https:\/\/www.microsoft.com\/en-us\/research\/wp-content\/uploads\/2025\/06\/opa-dpo-3.png\" alt=\"formular\" class=\"wp-image-1141043\" width=\"387\" height=\"73\"\/><\/figure>\n\n\n\n<p>\u5982\u679c\u975e\u8981\u5c06\u8fd9\u4e9b\u5f02\u7b56\u7565\uff08off-policy\uff09\u7684\u4f18\u9009\u54cd\u5e94\uff08preferred response\uff09\u62ff\u6765\u6784\u5efa DPO \u504f\u597d\u5bf9\uff0c\u4f1a\u5bfc\u81f4\u68af\u5ea6\u5728\u4e0b\u4e00\u6b21\u66f4\u65b0\u65f6\u51e0\u4e4e\u6d88\u5931\u3002<\/p>\n\n\n\n<p>\u91cd\u6e29 DPO \u8bad\u7ec3\u7684\u4f18\u5316\u76ee\u6807\uff1a<\/p>\n\n\n\n<figure class=\"wp-block-image is-resized\"><img loading=\"lazy\" src=\"https:\/\/www.microsoft.com\/en-us\/research\/wp-content\/uploads\/2025\/06\/opa-dpo-4-1024x122.png\" alt=\"text\" class=\"wp-image-1141044\" width=\"499\" height=\"59\"\/><\/figure>\n\n\n\n<p>\u5176\u4e2d y_w 
\u662f\u4f18\u9009\u54cd\u5e94\uff08preferred response\uff09\uff0cy_l \u662f\u88ab\u62d2\u54cd\u5e94\uff08rejected response\uff09\uff0c\u5176\u68af\u5ea6\u53ef\u8868\u793a\u4e3a\uff08\u03c3(\u22c5) \u662f sigmoid \u51fd\u6570\uff09\uff1a<\/p>\n\n\n\n<figure class=\"wp-block-image is-resized\"><img loading=\"lazy\" src=\"https:\/\/www.microsoft.com\/en-us\/research\/wp-content\/uploads\/2025\/06\/opa-dpo-5-1024x274.png\" alt=\"diagram\" class=\"wp-image-1141045\" width=\"433\" height=\"115\"\/><\/figure>\n\n\n\n<p>\u8bad\u7ec3\u5f00\u59cb\u524d \u03c0_\u03b8=\u03c0_ref\uff0c\u6240\u4ee5 sigmoid \u51fd\u6570\u5185\u90e8\u7684\u503c\u5e94\u5f53\u4e3a0\uff0c\u5373\u5f53\u524d\u7b56\u7565\u4f1a\u4ee5 0.5\u03b2 \u4e3a\u7cfb\u6570\u5bf9 y_w \u8fdb\u884c\u6700\u5927\u5bf9\u6570\u4f3c\u7136\u66f4\u65b0\uff08max-loglikelihood update\uff09\u3002\u4f46\u662f\u5728\u8fd9\u4e00\u6b65\u66f4\u65b0\u8fc7\u540e\uff0clog[\u03c0_\u03b8(y_w\u2223x,m)\/\u03c0_ref(y_w\u2223x,m)] \u5c06\u4f1a\u8d8b\u8fd1\u4e8e\u6781\u5927\u503c\uff08\u56e0\u4e3a\u5206\u5b50 &gt; 0\uff0c\u800c\u5206\u6bcd\u8d8b\u8fd1\u4e8e0\uff09\uff0c\u4ece\u800c\u5bfc\u81f4 
\u03c3(\u2212r_w+r_l)\u21920\u3002\u56e0\u6b64\uff0c\u68af\u5ea6\u4f1a\u5728\u4e0b\u4e00\u6b21\u66f4\u65b0\u65f6\u51e0\u4e4e\u6d88\u5931\u3002<\/p>\n\n\n\n<p>\u56de\u987e\u5e7b\u89c9\u8bc6\u522b\u7c7b\u7684\u65b9\u6cd5\uff0c\u4e13\u5bb6\u6539\u52a8\u540e\u7684\u54cd\u5e94\uff0c\u5927\u90e8\u5206\u5bf9\u4e8e\u539f\u6a21\u578b\u6765\u8bf4\u90fd\u662f\u5f02\u7b56\u7565\uff08off-policy\uff09\u7684\uff0c\u5373\u4f7f\u8fd9\u4e9b\u6539\u52a8\u518d\u5fae\u5c0f\u4e5f\u65e0\u6d4e\u4e8e\u4e8b\uff0c\u6240\u4ee5\u6839\u672c\u65e0\u6cd5\u6307\u671b\u8fd9\u4e9b\u4e13\u5bb6\u53cd\u9988\u80fd\u88ab\u6a21\u578b\u5b66\u4f1a\u3002\u76f8\u5bf9\u5e94\u5730\uff0c\u81ea\u6211\u8fdb\u5316\u7c7b\u65b9\u6cd5\u5373\u4f7f\u5b58\u5728\u5b66\u4e60\u6548\u7387\u4e0d\u9ad8\u7684\u6f5c\u5728\u95ee\u9898\uff0c\u4f46\u662f\u5b83\u6784\u5efa\u7684\u504f\u597d\u5bf9\u90fd\u6765\u81ea\u6a21\u578b\u672c\u8eab\uff0c\u5373\u5168\u662f\u540c\u7b56\u7565\uff08on-policy\uff09\u7684\uff0c\u56e0\u6b64\u6548\u679c\u6700\u597d\u3002<\/p>\n\n\n\n<p id=\"opa-dpo-\u6253\u7834\u5e38\u89c4-\u91cd\u5851\u5bf9\u9f50\u7b56\u7565\"><strong>OPA-DPO\uff1a\u6253\u7834\u5e38\u89c4\uff0c\u91cd\u5851\u5bf9\u9f50\u7b56\u7565<\/strong><\/p>\n\n\n\n<p>\u662f\u5426\u5b58\u5728\u4e00\u79cd\u65b9\u6cd5\u65e2\u80fd\u591f\u5229\u7528\u4e13\u5bb6\u7684\u7cbe\u786e\u53cd\u9988\uff0c\u53c8\u80fd\u5b8c\u5168\u907f\u514d\u5f02\u7b56\u7565\uff08off-policy\uff09\u5bfc\u81f4\u7684 KL \u6563\u5ea6\u7ea6\u675f\u95ee\u9898\uff1f<\/p>\n\n\n\n<p>\u9488\u5bf9\u73b0\u6709\u65b9\u6cd5\u7684\u5c40\u9650\u6027\uff0c\u5fae\u8f6f\u4e9a\u6d32\u7814\u7a76\u9662\u8054\u5408\u9999\u6e2f\u4e2d\u6587\u5927\u5b66\u63d0\u51fa\u4e86\u4e00\u79cd\u7b80\u5355\u800c\u9ad8\u6548\u7684\u7b97\u6cd5 On-Policy Alignment(OPA)-DPO\uff0c\u5c06\u4e13\u5bb6\u7684\u7cbe\u786e\u53cd\u9988\u6570\u636e\u5728 DPO \u8bad\u7ec3\u524d\u4e0e\u6a21\u578b\u7b56\u7565\u5bf9\u9f50\u3002\u5728\u4ec5\u4f7f\u75284.8k\u6570\u636e\u7684\u60c5\u51b5\u4e0b\uff0cOPA-DPO 
\u53ef\u4ee5\u5b9e\u73b0\u76ee\u524d SOTA \u7684\u6027\u80fd\uff0c\u800c\u4e4b\u524d\u7684 SOTA \u7b97\u6cd5\u9700\u898116k\u6570\u636e\u3002\u8be5\u6210\u679c\u5df2\u83b7\u9009\u8ba1\u7b97\u673a\u89c6\u89c9\u9886\u57df\u9876\u4f1a CVPR 2025 \u7684 Oral \u8bba\u6587\u3002<\/p>\n\n\n\n<p>Mitigating Hallucinations in Large Vision-Language Models via DPO: On-Policy Data Hold the Key<\/p>\n\n\n\n<figure class=\"wp-block-image size-full\"><img loading=\"lazy\" width=\"774\" height=\"718\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/06\/image-44.png\" alt=\"\" class=\"wp-image-26971\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/06\/image-44.png 774w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/06\/image-44-300x278.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/06\/image-44-768x712.png 768w\" sizes=\"(max-width: 774px) 100vw, 774px\" \/><\/figure>\n\n\n\n<p>OPA-DPO \u7684\u5177\u4f53\u5b9e\u73b0\u65b9\u6cd5\u5982\u4e0b\uff1a\u9996\u5148\uff0c\u7ed9\u5b9a\u56fe\u50cf\u548c\u63d0\u793a\uff0c\u8ba9\u6a21\u578b\u81ea\u884c\u751f\u6210\u5bf9\u5e94\u7684\u54cd\u5e94\uff1b\u63a5\u7740\uff0c\u5229\u7528\u4e13\u5bb6\u53cd\u9988\uff08\u5982 GPT-4v\uff09\u5bf9\u751f\u6210\u5185\u5bb9\u8fdb\u884c\u7ec6\u7c92\u5ea6\u4fee\u6539\uff0c\u4fdd\u7559\u6b63\u786e\u7684\u54cd\u5e94\u90e8\u5206\uff0c\u540c\u65f6\u7ea0\u6b63\u5176\u4e2d\u5b58\u5728\u7684\u5e7b\u89c9\u5185\u5bb9\uff1b\u7136\u540e\uff0c\u5c06\u6570\u636e\u96c6\u4e2d\u7684\u771f\u5b9e\u54cd\u5e94\u4e0e\u4e13\u5bb6\u4fee\u6539\u540e\u7684\u54cd\u5e94\u8fdb\u884c LoRA-SFT \u5fae\u8c03\uff0c\u5f97\u5230\u4e00\u4e2a\u65b0\u7684\u6a21\u578b\uff08\u7814\u7a76\u5458\u4eec\u5c06\u5176\u79f0\u4e3a OPA \u6a21\u578b\uff09\uff1b\u6700\u540e\uff0c\u5728 OPA \u6a21\u578b\u7684\u57fa\u7840\u4e0a\uff0c\u8fdb\u884c\u540e\u7eed\u7684 DPO \u8bad\u7ec3\uff0c\u5176\u4e2d\u7814\u7a76\u5458\u4eec\u53c2\u8003\u4e86 mDPO 
\u7684\u8bbe\u5b9a\uff0c\u5728\u6784\u5efa\u8bed\u8a00\u504f\u597d\u5bf9\u7684\u540c\u65f6\u4e5f\u6784\u5efa\u4e86\u56fe\u50cf\u504f\u597d\u5bf9\u4ee5\u53ca\u951a\u70b9\u5bf9\uff0c\u5c3d\u7ba1\u8fd9\u4e9b\u5143\u7d20\u90fd\u5f88\u91cd\u8981\uff0c\u4f46\u5bf9\u6700\u7ec8\u7ed3\u679c\u5f71\u54cd\u6700\u5927\u7684\u8fd8\u662f OPA \u64cd\u4f5c\u3002<\/p>\n\n\n\n<figure class=\"wp-block-image size-full\"><img loading=\"lazy\" width=\"793\" height=\"685\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/06\/image-45.png\" alt=\"\" class=\"wp-image-26973\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/06\/image-45.png 793w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/06\/image-45-300x259.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/06\/image-45-768x663.png 768w\" sizes=\"(max-width: 793px) 100vw, 793px\" \/><\/figure>\n\n\n\n<p>\u7814\u7a76\u5458\u4eec\u7efc\u5408\u6bd4\u8f83\u4e86\u57fa\u4e8e LLaVA-1.5-7B \u548c 13B \u6a21\u578b\u5fae\u8c03\u7684\u5404\u79cd DPO-based \u7684\u7b97\u6cd5\uff0cOPA-DPO \u5728\u4f7f\u7528 4.8k \u6570\u636e\u7684\u60c5\u51b5\u4e0b\u53ef\u5728\u591a\u4e2a\u6307\u6807\u4e0a\u5b9e\u73b0 SOTA \u6548\u679c\u3002<\/p>\n\n\n\n<p>\u7814\u7a76\u5458\u4eec\u53d1\u73b0\uff0c\u4f7f\u7528 OPA-DPO \u8bad\u7ec3\u8fc7\u7684\u6a21\u578b\u4f1a\u5448\u73b0\u51fa\u4e00\u79cd\u7565\u663e\u4fdd\u5b88\u7684\u7b56\u7565\uff0c\u5c24\u5176\u662f\u5728\u63cf\u8ff0\u4efb\u52a1\u4e2d\uff0c\u5b83\u901a\u5e38\u53ea\u8f93\u51fa\u663e\u8457\u5e76\u4e14\u786e\u5b9a\u7684\u89c2\u6d4b\uff0c\u800c\u5ffd\u7565\u4e00\u4e9b\u4e0d\u91cd\u8981\u7684\u7ec6\u8282\u3002<\/p>\n\n\n\n<p>\u4e0d\u4ec5\u5982\u6b64\uff0c\u7814\u7a76\u5458\u4eec\u8fd8\u89c2\u6d4b\u5230\u4e00\u4e2a\u6709\u8da3\u7684\u73b0\u8c61\uff1a\u57fa\u5ea7\u6a21\u578b\u5f80\u5f80\u9ed8\u8ba4 query 
\u4e2d\u7684\u8bed\u8a00\u662f\u51c6\u786e\u65e0\u8bef\u7684\uff0c\u5373\u4f7f\u8fd9\u90e8\u5206\u6587\u5b57\u5b58\u5728\u4e25\u91cd\u5e7b\u89c9\uff0c\u6a21\u578b\u4e5f\u4f1a\u987a\u7740\u5176\u63cf\u8ff0\u56fe\u7247\uff0c\u8fd9\u6216\u8bb8\u53ef\u4ee5\u7406\u89e3\u4e3a\u4e00\u79cd\u6587\u5b57\u60ef\u6027\u73b0\u8c61\u3002\u800c\u901a\u8fc7 OPA-DPO \u8bad\u7ec3\u7684\u6a21\u578b\u5219\u5c55\u73b0\u51fa\u4e86\u7504\u522b query \u6587\u5b57\u90e8\u5206\u5e7b\u89c9\u7684\u80fd\u529b\u3002<\/p>\n\n\n\n<h2>PPO\uff08Proximal Policy Optimization\uff0c\u8fd1\u7aef\u7b56\u7565\u4f18\u5316\uff09<\/h2>\n\n\n\n<p><strong>\u8bba\u6587\uff1a<a rel=\"noreferrer noopener\" href=\"https:\/\/arxiv.org\/abs\/1707.06347\" target=\"_blank\">https:\/\/arxiv.org\/abs\/1707.06347<\/a><\/strong><\/p>\n\n\n\n<ul><li><strong><em><a href=\"https:\/\/zhuanlan.zhihu.com\/p\/677607581\">https:\/\/zhuanlan.zhihu.com\/p\/677607581<\/a><\/em><\/strong><\/li><li><a href=\"https:\/\/zhuanlan.zhihu.com\/p\/13467768873\"><strong>https:\/\/zhuanlan.zhihu.com\/p\/13467768873<\/strong><\/a><\/li><li><a href=\"https:\/\/zhuanlan.zhihu.com\/p\/677607581\">\u56fe\u89e3\u5927\u6a21\u578bRLHF\u7cfb\u5217\u4e4b\uff1a\u4eba\u4eba\u90fd\u80fd\u770b\u61c2\u7684PPO\u539f\u7406\u4e0e\u6e90\u7801\u89e3\u8bfb<\/a><\/li><\/ul>\n\n\n\n<p><a href=\"https:\/\/zhuanlan.zhihu.com\/p\/7461863937\" target=\"_blank\" rel=\"noreferrer 
<strong>">
noopener\"><strong>\u4eba\u4eba\u90fd\u80fd\u770b\u61c2\u7684RL-PPO\u7406\u8bba\u77e5\u8bc6<\/strong><\/a><\/p>\n\n\n\n<p>PPO \u662f OpenAI \u5728 2017 \u5e74\u63d0\u51fa\u7684\u4e00\u79cd\u5f3a\u5316\u5b66\u4e60\u7b97\u6cd5\uff0c\u662f\u57fa\u4e8e\u7b56\u7565\u4f18\u5316\u7684\u7b97\u6cd5\uff0c\u7528\u4e8e\u8bad\u7ec3\u80fd\u591f\u6700\u5927\u5316\u7d2f\u79ef\u5956\u52b1\u7684\u667a\u80fd\u4f53\u3002<strong>PPO\u7b97\u6cd5\u901a\u8fc7\u5728\u6bcf\u6b21\u66f4\u65b0\u65f6\u9650\u5236\u65b0\u7b56\u7565\u4e0e\u65e7\u7b56\u7565\u4e4b\u95f4\u7684\u5dee\u5f02\uff0c\u4ece\u800c\u66f4\u7a33\u5b9a\u5730\u66f4\u65b0\u7b56\u7565\u53c2\u6570\u3002<\/strong>\u8fd9\u79cd\u65b9\u6cd5\u6709\u52a9\u4e8e\u907f\u514d\u8bad\u7ec3\u8fc7\u7a0b\u4e2d\u51fa\u73b0\u7684\u4e0d\u7a33\u5b9a\u6027\u548c\u5267\u70c8\u6ce2\u52a8\uff0c\u4f7f\u5f97\u7b97\u6cd5\u66f4\u5bb9\u6613\u6536\u655b\u5e76\u5b66\u4e60\u5230\u66f4\u597d\u7684\u7b56\u7565\u3002<\/p>\n\n\n\n<h3><a href=\"https:\/\/zhuanlan.zhihu.com\/p\/677607581\" target=\"_blank\" rel=\"noreferrer noopener\">\u5f3a\u5316\u5b66\u4e60\u57fa\u672c\u6982\u5ff5<\/a><\/h3>\n\n\n\n<figure class=\"wp-block-image size-full is-resized\"><img loading=\"lazy\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-93.png\" alt=\"\" class=\"wp-image-25249\" width=\"457\" height=\"204\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-93.png 571w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-93-300x133.png 300w\" sizes=\"(max-width: 457px) 100vw, 457px\" 
\/><\/figure>\n\n\n\n<ul><li>\u5f3a\u5316\u5b66\u4e60\u7684\u4e24\u4e2a\u5b9e\u4f53\uff1a<strong>\u667a\u80fd\u4f53\uff08Agent\uff09<\/strong>\u4e0e<strong>\u73af\u5883\uff08Environment\uff09<\/strong><\/li><li>\u5f3a\u5316\u5b66\u4e60\u4e2d\u4e24\u4e2a\u5b9e\u4f53\u7684\u4ea4\u4e92\uff1a<ul><li><strong>\u72b6\u6001\u7a7a\u95f4S<\/strong>\uff1aS\u5373\u4e3aState\uff0c\u6307\u73af\u5883\u4e2d\u6240\u6709\u53ef\u80fd\u72b6\u6001\u7684\u96c6\u5408<\/li><li><strong>\u52a8\u4f5c\u7a7a\u95f4A<\/strong>\uff1aA\u5373\u4e3aAction\uff0c\u6307\u667a\u80fd\u4f53\u6240\u6709\u53ef\u80fd\u52a8\u4f5c\u7684\u96c6\u5408<\/li><li><strong>\u5956\u52b1R\uff1a<\/strong>R\u5373\u4e3aReward\uff0c\u6307\u667a\u80fd\u4f53\u5728\u73af\u5883\u7684\u67d0\u4e00\u72b6\u6001\u4e0b\u6240\u83b7\u5f97\u7684\u5956\u52b1\u3002<\/li><\/ul><\/li><\/ul>\n\n\n\n<p>\u4ee5\u4e0a\u56fe\u4e3a\u4f8b\uff0c\u667a\u80fd\u4f53\u4e0e\u73af\u5883\u7684\u4ea4\u4e92\u8fc7\u7a0b\u5982\u4e0b\uff1a<\/p>\n\n\n\n<ul><li>\u5728&nbsp;t&nbsp;\u65f6\u523b\uff0c\u73af\u5883\u7684\u72b6\u6001\u4e3a&nbsp;St&nbsp;\uff0c\u8fbe\u5230\u8fd9\u4e00\u72b6\u6001\u6240\u83b7\u5f97\u7684\u5956\u52b1\u4e3a<strong>&nbsp;Rt<\/strong><\/li><li>\u667a\u80fd\u4f53\u89c2\u6d4b\u5230&nbsp;St&nbsp;\u4e0e&nbsp;Rt&nbsp;\uff0c\u91c7\u53d6\u76f8\u5e94\u52a8\u4f5c&nbsp;At<\/li><li>\u667a\u80fd\u4f53\u91c7\u53d6&nbsp;At&nbsp;\u540e\uff0c\u73af\u5883\u72b6\u6001\u53d8\u4e3a&nbsp;St+1&nbsp;\uff0c\u5f97\u5230\u76f8\u5e94\u7684\u5956\u52b1&nbsp;Rt+1<\/li><\/ul>\n\n\n\n<p>\u5956\u52b1\u503c&nbsp;<strong>Rt<\/strong>&nbsp;\uff0c\u5b83\u8868\u793a\u73af\u5883\u8fdb\u5165\u72b6\u6001&nbsp;St&nbsp;\u4e0b\u7684<strong>\u5373\u65f6\u5956\u52b1<\/strong>\u3002<br><strong>\u4f46\u5982\u679c\u53ea\u8003\u8651\u5373\u65f6\u5956\u52b1\uff0c\u76ee\u5149\u4f3c\u4e4e\u592a\u77ed\u6d45\u4e86<\/strong>\uff1a\u5f53\u4e0b\u7684\u72b6\u6001\u548c\u52a8\u4f5c\u4f1a\u5f71\u54cd\u5230\u672a\u6765\u7684\u72b6\u6001\u548c\u52a8\u4f5c\uff0c\u8fdb\u800c\u5f71\u54cd\u5230\u672a\u6765\u
7684\u6574\u4f53\u6536\u76ca\u3002<br>\u6240\u4ee5\uff0c\u4e00\u79cd\u66f4\u597d\u7684\u8bbe\u8ba1\u65b9\u5f0f\u662f\uff1a<strong>t\u65f6\u523b\u72b6\u6001s\u7684\u603b\u6536\u76ca = \u8eab\u5904\u72b6\u6001s\u80fd\u5e26\u6765\u7684<u>\u5373\u65f6\u6536\u76ca<\/u>&nbsp;+ \u4ece\u72b6\u6001s\u51fa\u53d1\u540e\u80fd\u5e26\u6765\u7684<u>\u672a\u6765\u6536\u76ca<\/u>\u3002<\/strong>\u5199\u6210\u8868\u8fbe\u5f0f\u5c31\u662f\uff1a<strong>V<sub>t<\/sub>=R<sub>t<\/sub>+\u03b3V<sub>t+1<\/sub><\/strong><\/p>\n\n\n\n<p>\u5176\u4e2d\uff1a<\/p>\n\n\n\n<ul><li><strong>Vt&nbsp;<\/strong>\uff1a&nbsp;t&nbsp;\u65f6\u523b\u7684\u603b\u6536\u76ca\uff0c\u6ce8\u610f\u8fd9\u4e2a\u6536\u76ca\u8574\u6db5\u4e86\u201c\u5373\u65f6\u201d\u548c\u201c\u672a\u6765\u201d\u7684\u6982\u5ff5<\/li><li><strong>Rt<\/strong>&nbsp;\uff1a&nbsp;t&nbsp;\u65f6\u523b\u7684\u5373\u65f6\u6536\u76ca<\/li><li>Vt+1&nbsp;\uff1a&nbsp;t+1&nbsp;\u65f6\u523b\u7684\u603b\u6536\u76ca\uff0c\u6ce8\u610f\u8fd9\u4e2a\u6536\u76ca\u8574\u6db5\u4e86\u201c\u5373\u65f6\u201d\u548c\u201c\u672a\u6765\u201d\u7684\u6982\u5ff5\u3002\u800c&nbsp;Vt+1&nbsp;\u5bf9&nbsp;Vt&nbsp;\u6765\u8bf4\u5c31\u662f\u201c\u672a\u6765\u201d\u3002<\/li><li>\u03b3&nbsp;\uff1a\u6298\u6263\u56e0\u5b50\u3002\u5b83\u51b3\u5b9a\u4e86\u6211\u4eec\u5728\u591a\u5927\u7a0b\u5ea6\u4e0a\u8003\u8651\u5c06\u201c\u672a\u6765\u6536\u76ca\u201d\u7eb3\u5165\u201c\u5f53\u4e0b\u6536\u76ca\u201d\u3002<\/li><\/ul>\n\n\n\n<p><strong>\u5173\u952e\u6982\u5ff5\uff1a<\/strong><\/p>\n\n\n\n<p><strong>\u7b56\u7565\u51fd\u6570<\/strong>\u662f\u4e00\u4e2a\u6982\u7387\u5bc6\u5ea6\u51fd\u6570\uff08PDF\uff09\uff0c\u8f93\u5165\u662f\u5f53\u524d\u72b6\u6001s\uff0c\u8f93\u51fa\u4e3a\u4e00\u4e2a\u6982\u7387\u5206\u5e03\uff0c\u8868\u5f81\u6bcf\u4e2a action \u7684\u6982\u7387\uff1a<\/p>\n\n\n\n<div class=\"wp-block-image\"><figure class=\"aligncenter size-full is-resized\"><img loading=\"lazy\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-94.png\" alt=\"\" class=\"wp-image-25257\" width=\"255\" height=\"25\" 
srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-94.png 322w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-94-300x29.png 300w\" sizes=\"(max-width: 255px) 100vw, 255px\" \/><\/figure><\/div>\n\n\n\n<p><strong>\u52a8\u4f5c\u503c\u51fd\u6570\uff1a<\/strong>\u8bc4\u4ef7\u5728\u72b6\u6001&nbsp;st&nbsp;\u7684\u60c5\u51b5\u4e0b\u505a\u51fa\u52a8\u4f5c&nbsp;at\u7684\u597d\u574f\u7a0b\u5ea6\u3002<\/p>\n\n\n\n<div class=\"wp-block-image\"><figure class=\"aligncenter size-full is-resized\"><img loading=\"lazy\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-95.png\" alt=\"\" class=\"wp-image-25260\" width=\"330\" height=\"33\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-95.png 337w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-95-300x30.png 300w\" sizes=\"(max-width: 330px) 100vw, 330px\" \/><\/figure><\/div>\n\n\n\n<p><strong>\u72b6\u6001\u503c\u51fd\u6570\uff1a<\/strong><\/p>\n\n\n\n<div class=\"wp-block-image\"><figure class=\"aligncenter size-full is-resized\"><img loading=\"lazy\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-96.png\" alt=\"\" class=\"wp-image-25267\" width=\"184\" height=\"30\"\/><\/figure><\/div>\n\n\n\n<ul><li>\u6d88\u6389\u4e86\u52a8\u4f5c A \uff0c\u8fd9\u6837&nbsp;V\u03c0&nbsp;\u53ea\u8ddf\u72b6\u6001 s 
\u4e0e\u7b56\u7565\u51fd\u6570&nbsp;\u03c0&nbsp;\u6709\u5173\u4e86\u3002<\/li><li>\u7ed9\u5b9a&nbsp;\u03c0\uff0c\u53ef\u4ee5\u8bc4\u4ef7\u5f53\u524d\u72b6\u6001\u7684\u597d\u574f\uff1b\u7ed9\u5b9a\u72b6\u6001st\uff0c\u53ef\u4ee5\u8bc4\u4ef7\u7b56\u7565&nbsp;\u03c0\u7684\u597d\u574f\u3002<\/li><\/ul>\n\n\n\n<p><strong>\u4f18\u52bf\u51fd\u6570<\/strong>\uff1a\u6709\u4e9b\u65f6\u5019\u6211\u4eec\u4e0d\u9700\u8981\u63cf\u8ff0\u4e00\u4e2a\u884c\u52a8\u7684\u7edd\u5bf9\u597d\u574f\uff0c\u800c\u53ea\u9700\u8981\u77e5\u9053\u5b83\u76f8\u5bf9\u4e8e\u5e73\u5747\u6c34\u5e73\u7684\u4f18\u52bf\u3002\u4e5f\u5c31\u662f\u8bf4\uff0c\u6211\u4eec\u53ea\u60f3\u77e5\u9053\u4e00\u4e2a\u884c\u52a8\u7684\u76f8\u5bf9&nbsp;<strong>\u4f18\u52bf<\/strong>&nbsp;\u3002\u8fd9\u5c31\u662f\u4f18\u52bf\u51fd\u6570\u7684\u6982\u5ff5\u3002<\/p>\n\n\n\n<p>\u4e00\u4e2a\u670d\u4ece\u7b56\u7565&nbsp;<img src=\"https:\/\/spinningup.readthedocs.io\/zh-cn\/latest\/_images\/math\/6abd2b6bd2a10c499ace309d93cbfc9a48fbb708.svg\" alt=\"\\pi\">&nbsp;\u7684\u4f18\u52bf\u51fd\u6570\uff0c\u63cf\u8ff0\u7684\u662f\u5b83\u5728\u72b6\u6001&nbsp;<img src=\"https:\/\/spinningup.readthedocs.io\/zh-cn\/latest\/_images\/math\/5ecb694c8b2755909226b2d74b8b998d9b4e6148.svg\" alt=\"s\">&nbsp;\u4e0b\u91c7\u53d6\u884c\u4e3a&nbsp;<img src=\"https:\/\/spinningup.readthedocs.io\/zh-cn\/latest\/_images\/math\/7299c243b08052a2a26e53de560e7002cb31b38f.svg\" alt=\"a\">&nbsp;\u6bd4\u968f\u673a\u9009\u62e9\u4e00\u4e2a\u884c\u4e3a\u597d\u591a\u5c11\uff08\u5047\u8bbe\u4e4b\u540e\u4e00\u76f4\u670d\u4ece\u7b56\u7565&nbsp;<img src=\"https:\/\/spinningup.readthedocs.io\/zh-cn\/latest\/_images\/math\/6abd2b6bd2a10c499ace309d93cbfc9a48fbb708.svg\" alt=\"\\pi\">&nbsp;\uff09\u3002\u6570\u5b66\u89d2\u5ea6\u4e0a\uff0c\u4f18\u52bf\u51fd\u6570\u7684\u5b9a\u4e49\u4e3a\uff1a<\/p>\n\n\n\n<div class=\"wp-block-image\"><figure class=\"aligncenter size-full is-resized\"><img loading=\"lazy\" 
src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-97.png\" alt=\"\" class=\"wp-image-25271\" width=\"230\" height=\"36\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-97.png 386w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-97-300x47.png 300w\" sizes=\"(max-width: 230px) 100vw, 230px\" \/><\/figure><\/div>\n\n\n\n<p><strong>\u957f\u671f\u4ef7\u503c\u53ef\u4ee5\u8868\u793a\u4e3a\u72b6\u6001\u503c\u51fd\u6570\uff08State Value Function\uff09\u6216\u52a8\u4f5c\u503c\u51fd\u6570\uff08Action Value Function\uff09\u3002<\/strong><\/p>\n\n\n\n<p><strong>\u4f18\u5316\u65b9\u6cd5\uff1a<\/strong><\/p>\n\n\n\n<p>\u2022<strong>value-based<\/strong>\uff1a\u4f18\u5316\u72b6\u6001\u52a8\u4f5c\u503c\u51fd\u6570Q(s) \uff0c\u8d2a\u5fc3\u9009\u62e9\uff08\u786e\u5b9a\u6027\u7b56\u7565\uff09 \uff1aQ-Learning<\/p>\n\n\n\n<p>\u2022<strong>policy-based<\/strong> \uff1a\u76f4\u63a5\u4f18\u5316\u7b56\u7565\u51fd\u6570\u03c0(s, a)\uff0c\u6309\u6982\u7387\u91c7\u6837\uff08\u968f\u673a\u6027\u7b56\u7565\uff09 \uff1aREINFORCE<\/p>\n\n\n\n<p>\u2022<a href=\"https:\/\/hrl.boyuai.com\/chapter\/2\/actor-critic%E7%AE%97%E6%B3%95\/\" target=\"_blank\" rel=\"noreferrer noopener\"><strong>Actor-Critic<\/strong><\/a> \u2022\u878d\u5408\u4e0a\u8ff0\u65b9\u6cd5\uff0c\u540c\u65f6\u4f18\u5316Q\u4e0e\u03c0\uff1aTRPO\u3001PPO<\/p>\n\n\n\n<figure class=\"wp-block-image size-full is-resized\"><img loading=\"lazy\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-98.png\" alt=\"\" class=\"wp-image-25279\" width=\"643\" height=\"291\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-98.png 674w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-98-300x136.png 300w\" sizes=\"(max-width: 643px) 100vw, 643px\" \/><\/figure>\n\n\n\n<h3><a href=\"https:\/\/zhuanlan.zhihu.com\/p\/677607581\" target=\"_blank\" rel=\"noreferrer 
noopener\">NLP\u4e2d\u7684\u5f3a\u5316\u5b66\u4e60<\/a><\/h3>\n\n\n\n<p>\u5728\u7b2c\u4e00\u90e8\u5206\u4ecb\u7ecd\u4e86\u901a\u7528\u5f3a\u5316\u5b66\u4e60\u7684\u6d41\u7a0b\uff0c\u90a3\u4e48\u6211\u4eec\u8981\u600e\u4e48\u628a\u8fd9\u4e2a\u6d41\u7a0b\u5bf9\u5e94\u5230NLP\u4efb\u52a1\u4e2d\u5462\uff1f<strong>\u6362\u53e5\u8bdd\u8bf4\uff0cNLP\u4efb\u52a1\u4e2d\u7684\u667a\u80fd\u4f53\u3001\u73af\u5883\u3001\u72b6\u6001\u3001\u52a8\u4f5c\u7b49\u7b49\uff0c\u90fd\u662f\u6307\u4ec0\u4e48\u5462\uff1f<\/strong><\/p>\n\n\n\n<figure class=\"wp-block-image size-full\"><img loading=\"lazy\" width=\"754\" height=\"502\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-107.png\" alt=\"\" class=\"wp-image-25309\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-107.png 754w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-107-300x200.png 300w\" sizes=\"(max-width: 754px) 100vw, 754px\" \/><\/figure>\n\n\n\n<p>\u56de\u60f3\u4e00\u4e0b\u6211\u4eec\u5bf9NLP\u4efb\u52a1\u505a\u5f3a\u5316\u5b66\u4e60\uff08RLHF\uff09\u7684\u76ee\u7684\uff1a<strong>\u6211\u4eec\u5e0c\u671b\u7ed9\u6a21\u578b\u4e00\u4e2aprompt\uff0c\u8ba9\u6a21\u578b\u80fd\u751f\u6210\u7b26\u5408\u4eba\u7c7b\u559c\u597d\u7684response<\/strong>\u3002\u518d\u56de\u60f3\u4e00\u4e0bgpt\u6a21\u578b\u505a\u63a8\u7406\u7684\u8fc7\u7a0b\uff1a<strong>\u6bcf\u4e2a\u65f6\u523b<\/strong>&nbsp;t&nbsp;<strong>\u53ea\u4ea7\u751f\u4e00\u4e2atoken\uff0c\u5373token\u662f\u4e00\u4e2a\u4e00\u4e2a\u8e66\u51fa\u6765\u7684\uff0c\u5148\u6709\u4e0a\u4e00\u4e2atoken\uff0c\u518d\u6709\u4e0b\u4e00\u4e2atoken\u3002<\/strong><br><br><br>\u590d\u4e60\u4e86\u8fd9\u4e24\u70b9\uff0c\u73b0\u5728\u6211\u4eec\u53ef\u4ee5\u66f4\u597d\u89e3\u8bfb\u4e0a\u9762\u8fd9\u5f20\u56fe\u4e86\uff1a<\/p>\n\n\n\n<ul><li>\u6211\u4eec\u5148\u5582\u7ed9\u6a21\u578b\u4e00\u4e2aprompt\uff0c\u671f\u671b\u5b83\u80fd\u4ea7\u51fa\u7b26\u5408\u4eba\u7c7b\u559c\u597d\u7684response<\/li><li>\u5728&nbsp;t&nbsp;\u65f6\u523b\uff0c\u6a21\u578
b\u6839\u636e\u4e0a\u6587\uff0c\u4ea7\u51fa\u4e00\u4e2atoken\uff0c<strong>\u8fd9\u4e2atoken\u5373\u5bf9\u5e94\u7740\u5f3a\u5316\u5b66\u4e60\u4e2d\u7684\u52a8\u4f5c\uff0c\u6211\u4eec\u8bb0\u4e3a<\/strong>At&nbsp;\u3002\u56e0\u6b64\u4e0d\u96be\u7406\u89e3\uff0c\u5728NLP\u8bed\u5883\u4e0b\uff0c\u5f3a\u5316\u5b66\u4e60\u4efb\u52a1\u7684\u52a8\u4f5c\u7a7a\u95f4\u5c31\u5bf9\u5e94\u7740\u8bcd\u8868\u3002<\/li><li>\u5728&nbsp;t&nbsp;\u65f6\u523b\uff0c<strong>\u6a21\u578b\u4ea7\u51fatoken&nbsp;<\/strong>At<strong>\u5bf9\u5e94\u7740\u7684\u5373\u65f6\u6536\u76ca\u4e3a<\/strong>Rt<strong>\uff0c\u603b\u6536\u76ca\u4e3a<\/strong>Vt<strong>\uff08<\/strong>\u590d\u4e60\u4e00\u4e0b\uff0c&nbsp;Vt&nbsp;\u8574\u542b\u7740\u201c\u5373\u65f6\u6536\u76ca\u201d\u4e0e\u201c\u672a\u6765\u6536\u76ca\u201d\u4e24\u4e2a\u5185\u5bb9\uff09\u3002\u8fd9\u4e2a\u6536\u76ca\u5373\u53ef\u4ee5\u7406\u89e3\u4e3a\u201c<strong>\u5bf9\u4eba\u7c7b\u559c\u597d\u7684\u8861\u91cf<\/strong>\u201d\u3002\u6b64\u523b\uff0c<strong>\u6a21\u578b\u7684\u72b6\u6001\u4ece<\/strong>St<strong>\u53d8\u4e3a<\/strong>St+1<strong>\uff0c\u4e5f\u5c31\u662f\u4ece\u201c\u4e0a\u6587\u201d\u53d8\u6210\u201c\u4e0a\u6587 + 
\u65b0\u4ea7\u51fa\u7684token\u201d<\/strong><\/li><li>\u5728NLP\u8bed\u5883\u4e0b\uff0c\u667a\u80fd\u4f53\u662f\u8bed\u8a00\u6a21\u578b\u672c\u8eab\uff0c\u73af\u5883\u5219\u5bf9\u5e94\u7740\u5b83\u4ea7\u51fa\u7684\u8bed\u6599<\/li><\/ul>\n\n\n\n<p>\u8fd9\u6837\uff0c\u6211\u4eec\u5c31\u5927\u81f4\u89e3\u91ca\u4e86NLP\u8bed\u5883\u4e0b\u7684\u5f3a\u5316\u5b66\u4e60\u6846\u67b6\uff0c\u4e0d\u8fc7\u9488\u5bf9\u4e0a\u9762\u8fd9\u5f20\u56fe\uff0c\u4f60\u53ef\u80fd\u8fd8\u6709\u4ee5\u4e0b\u95ee\u9898\uff1a<\/p>\n\n\n\n<p><br><strong>\uff081\uff09\u95ee\u98981\uff1a\u56fe\u4e2d\u7684\u4e0b\u6807\u662f\u4e0d\u662f\u5199\u5f97\u4e0d\u592a\u5bf9\uff1f\u4f8b\u5982\u6839\u636e\u7b2c\u4e00\u90e8\u5206\u7684\u4ecb\u7ecd\uff0c<\/strong>&nbsp;At&nbsp;<strong>\u5e94\u8be5\u5bf9\u5e94\u7740<\/strong>&nbsp;Rt+1&nbsp;<strong>\uff0c<\/strong>&nbsp;At+1&nbsp;<strong>\u5e94\u8be5\u5bf9\u5e94\u7740<\/strong>&nbsp;Rt+2&nbsp;<strong>\uff0c\u4ee5\u6b64\u7c7b\u63a8\uff1f<\/strong><br>\u7b54\uff1a\u4f60\u8bf4\u7684\u5bf9\u3002\u4f46\u8fd9\u91cc\u6211\u4eec\u4e0d\u7528\u592a\u7ea0\u7ed3\u4e0b\u6807\u7684\u95ee\u9898\uff0c\u53ea\u9700\u8981\u8bb0\u4f4f\u5728\u5bf9\u5e94\u7684response 
token\u4f4d\u7f6e\uff0c\u4f1a\u4ea7\u751f\u76f8\u5e94\u7684\u5373\u65f6\u5956\u52b1\u548c\u603b\u6536\u76ca\u5373\u53ef\u3002\u4e4b\u6240\u4ee5\u7528\u56fe\u4e2d\u8fd9\u6837\u7684\u4e0b\u6807\uff0c\u662f\u66f4\u65b9\u4fbf\u6211\u4eec\u540e\u7eed\u7406\u89e3\u4ee3\u7801\u3002<br><br><br><strong>\uff082\uff09\u95ee\u98982\uff1a\u6211\u77e5\u9053<\/strong>&nbsp;At&nbsp;<strong>\u80af\u5b9a\u662f\u7531\u8bed\u8a00\u6a21\u578b\u4ea7\u751f\u7684\uff0c\u90a3\u4e48<\/strong>&nbsp;Rt\uff0cVt&nbsp;<strong>\u662f\u600e\u4e48\u6765\u7684\u5462\uff0c\u4e5f\u662f\u8bed\u8a00\u6a21\u578b\u4ea7\u751f\u7684\u5417\uff1f<\/strong><br>\u7b54\uff1a\u5148\u76f4\u63a5\u8bf4\u7ed3\u8bba\uff0c&nbsp;At&nbsp;\u662f\u7531\u6211\u4eec\u7684\u8bed\u8a00\u6a21\u578b\u4ea7\u751f\u7684\uff0c&nbsp;Rt\uff0cVt&nbsp;\u5219\u5206\u522b\u7531\u53e6\u5916\u4e24\u4e2a\u6a21\u578b\u6765\u4ea7\u751f\uff0c\u5728\u540e\u6587\u4e2d\u6211\u4eec\u4f1a\u7ec6\u8bf4\u3002<br><br><br><strong>\uff083\uff09\u95ee\u98983\uff1a\u8bed\u8a00\u6a21\u578b\u7684\u53c2\u6570\u5728\u4ec0\u4e48\u65f6\u5019\u66f4\u65b0\uff1f\u662f\u89c2\u6d4b\u5230\u4e00\u4e2a<\/strong>&nbsp;Rt,Vt&nbsp;<strong>\uff0c\u5c31\u66f4\u65b0\u4e00\u6b21\u53c2\u6570\uff0c\u7136\u540e\u518d\u53bb\u4ea7\u751f<\/strong>&nbsp;At+1&nbsp;<strong>\u5417\uff1f<\/strong><br>\u7b54\uff1a\u5f53\u7136\u4e0d\u662f\u3002\u4f60\u53ea\u770b\u5230\u67d0\u4e2a\u65f6\u523b\u7684\u6536\u76ca\uff0c\u5c31\u6025\u7740\u7528\u5b83\u66f4\u65b0\u6a21\u578b\uff0c\u8fd9\u4e5f\u592a\u83bd\u649e\u4e86\u3002\u6211\u4eec\u80af\u5b9a\u662f\u8981\u7b49\u6709\u8db3\u591f\u7684\u89c2\u6d4b\u6570\u636e\u4e86\uff08\u4f8b\u5982\u7b49\u6a21\u578b\u628a\u5b8c\u6574\u7684response\u751f\u6210\u5b8c\uff09\uff0c\u518d\u53bb\u66f4\u65b0\u5b83\u7684\u53c2\u6570\u3002\u8fd9\u4e00\u70b9\u6211\u4eec\u4e5f\u653e\u5728\u540e\u6587\u7ec6\u8bf4\u3002<br><br><br><strong>\uff084\uff09\u95ee\u98984\uff1a\u518d\u8c08\u8c08<\/strong>&nbsp;Rt,Vt&nbsp;<strong>\u5427\uff0c\u5728NLP\u7684\u8bed\u5883\u4e
0b\u6211\u8fd8\u662f\u4e0d\u592a\u7406\u89e3\u5b83\u4eec<\/strong><br>\u7b54\uff1a<\/p>\n\n\n\n<ul><li>\u9996\u5148\uff0c\u201c\u6536\u76ca\u201d\u7684\u542b\u4e49\u662f\u201c\u5bf9\u4eba\u7c7b\u559c\u597d\u7684\u8861\u91cf\u201d<\/li><li>Rt&nbsp;\uff1a\u5373\u65f6\u6536\u76ca\uff0c\u6307\u8bed\u8a00\u6a21\u578b\u5f53\u4e0b\u4ea7\u751ftoken&nbsp;At&nbsp;\u5e26\u6765\u7684\u6536\u76ca<\/li><li>Vt&nbsp;\uff1a \u5b9e\u9645\u671f\u671b\u603b\u6536\u76ca\uff08\u5373\u65f6+\u672a\u6765\uff09\uff0c\u6307\u5bf9\u8bed\u8a00\u6a21\u578b\u201c\u5f53\u4e0b\u4ea7\u751ftoken&nbsp;At&nbsp;\uff0c\u4e00\u76f4\u5230\u6574\u4e2aresponse\u751f\u6210\u7ed3\u675f\u201d\u540e\u7684\u671f\u671b\u6536\u76ca\u9884\u4f30\u3002\u56e0\u4e3a\u5f53\u4e0b\u8bed\u8a00\u6a21\u578b\u8fd8\u6ca1\u4ea7\u51fa&nbsp;At&nbsp;\u540e\u7684token\uff0c\u6240\u4ee5\u6211\u4eec\u53ea\u662f\u5bf9\u5b83\u4e4b\u540e\u4e00\u7cfb\u5217\u52a8\u4f5c\u7684\u6536\u76ca\u505a\u4e86\u4f30\u8ba1\uff0c\u56e0\u800c\u79f0\u4e3a\u201c\u671f\u671b\u603b\u6536\u76ca\u201d\u3002<\/li><\/ul>\n\n\n\n<h3><a href=\"https:\/\/zhuanlan.zhihu.com\/p\/677607581\" target=\"_blank\" rel=\"noreferrer noopener\">RLHF\u4e2d\u7684\u56db\u4e2a\u91cd\u8981\u89d2\u8272\uff1a<\/a><\/h3>\n\n\n\n<p>\u6211\u4eec\u4ece\u7b2c\u4e8c\u90e8\u5206\u4e2d\u5df2\u7ecf\u77e5\u9053\uff1a\u751f\u6210token&nbsp;At&nbsp;\u548c\u5bf9\u5e94\u6536\u76ca&nbsp;Rt,Vt&nbsp;\u7684\u5e76\u4e0d\u662f\u4e00\u4e2a\u6a21\u578b\u3002\u90a3\u4e48\u5728RLHF\u4e2d\u5230\u5e95\u6709\u51e0\u4e2a\u6a21\u578b\uff1f\u5b83\u4eec\u662f\u600e\u4e48\u914d\u5408\u505a\u8bad\u7ec3\u7684\uff1f\u800c\u6211\u4eec\u6700\u7ec8\u8981\u7684\u662f\u54ea\u4e2a\u6a21\u578b\uff1f<\/p>\n\n\n\n<figure class=\"wp-block-image size-full\"><img loading=\"lazy\" width=\"820\" height=\"847\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-108.png\" alt=\"\" class=\"wp-image-25313\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-108.png 820w, 
http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-108-290x300.png 290w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-108-768x793.png 768w\" sizes=\"(max-width: 820px) 100vw, 820px\" \/><\/figure>\n\n\n\n<p>\u5982\u4e0a\u56fe\uff0c<strong>\u5728RLHF-PPO\u9636\u6bb5\uff0c\u4e00\u5171\u6709\u56db\u4e2a\u4e3b\u8981\u6a21\u578b<\/strong>\uff0c\u5206\u522b\u662f\uff1a<\/p>\n\n\n\n<ul><li><strong>Actor Model\uff1a\u6f14\u5458\u6a21\u578b<\/strong>\uff0c\u8fd9\u5c31\u662f\u6211\u4eec\u60f3\u8981\u8bad\u7ec3\u7684\u76ee\u6807\u8bed\u8a00\u6a21\u578b<\/li><li><strong>Critic Model\uff1a\u8bc4\u8bba\u5bb6\u6a21\u578b<\/strong>\uff0c\u5b83\u7684\u4f5c\u7528\u662f\u9884\u4f30\u603b\u6536\u76ca&nbsp;<strong>Vt<\/strong><\/li><li><strong>Reward Model\uff1a\u5956\u52b1\u6a21\u578b<\/strong>\uff0c\u5b83\u7684\u4f5c\u7528\u662f\u8ba1\u7b97\u5373\u65f6\u6536\u76ca<strong>&nbsp;Rt<\/strong><\/li><li><strong>Reference Model\uff1a\u53c2\u8003\u6a21\u578b<\/strong>\uff0c\u5b83\u7684\u4f5c\u7528\u662f\u5728RLHF\u9636\u6bb5\u7ed9\u8bed\u8a00\u6a21\u578b\u589e\u52a0\u4e00\u4e9b\u201c\u7ea6\u675f\u201d\uff0c\u9632\u6b62\u8bed\u8a00\u6a21\u578b\u8bad\u6b6a\uff08\u671d\u4e0d\u53d7\u63a7\u5236\u7684\u65b9\u5411\u66f4\u65b0\uff0c\u6548\u679c\u53ef\u80fd\u8d8a\u6765\u8d8a\u5dee\uff09<\/li><\/ul>\n\n\n\n<p>\u5176\u4e2d:<\/p>\n\n\n\n<ul><li><strong>Actor\/Critic Model<\/strong>\u5728RLHF\u9636\u6bb5\u662f<strong>\u9700\u8981\u8bad\u7ec3<\/strong>\u7684\uff08\u56fe\u4e2d\u7ed9\u8fd9\u4e24\u4e2a\u6a21\u578b\u52a0\u4e86\u7c97\u8fb9\uff0c\u5c31\u662f\u8868\u793a\u8fd9\u4e2a\u542b\u4e49\uff09\uff1b\u800c<strong>Reward\/Reference Model<\/strong>\u662f<strong>\u53c2\u6570\u51bb\u7ed3<\/strong>\u7684\u3002<\/li><li>Critic\/Reward\/Reference 
Model\u5171\u540c\u7ec4\u6210\u4e86\u4e00\u4e2a\u201c\u5956\u52b1-loss\u201d\u8ba1\u7b97\u4f53\u7cfb\uff08\u6211\u81ea\u5df1\u547d\u540d\u7684\uff0c\u4e3a\u4e86\u65b9\u4fbf\u7406\u89e3\uff09\uff0c\u6211\u4eec\u7efc\u5408\u5b83\u4eec\u7684\u7ed3\u679c\u8ba1\u7b97loss\uff0c\u7528\u4e8e\u66f4\u65b0Actor\u548cCritic Model<\/li><\/ul>\n\n\n\n<h4 id=\"h_677607581_5\">Actor Model (\u6f14\u5458\u6a21\u578b)<\/h4>\n\n\n\n<p>\u6b63\u5982\u524d\u6587\u6240\u8bf4\uff0c<strong>Actor\u5c31\u662f\u6211\u4eec\u60f3\u8981\u8bad\u7ec3\u7684\u76ee\u6807\u8bed\u8a00\u6a21\u578b\u3002\u6211\u4eec\u4e00\u822c\u7528SFT\u9636\u6bb5\u4ea7\u51fa\u7684SFT\u6a21\u578b\u6765\u5bf9\u5b83\u505a\u521d\u59cb\u5316\u3002<\/strong><\/p>\n\n\n\n<figure class=\"wp-block-image size-full\"><img loading=\"lazy\" width=\"541\" height=\"450\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-109.png\" alt=\"\" class=\"wp-image-25317\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-109.png 541w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-109-300x250.png 300w\" sizes=\"(max-width: 541px) 100vw, 541px\" \/><\/figure>\n\n\n\n<p>\u6211\u4eec\u7684\u6700\u7ec8\u76ee\u7684\u662f\u8ba9Actor\u6a21\u578b\u80fd\u4ea7\u751f\u7b26\u5408\u4eba\u7c7b\u559c\u597d\u7684response\u3002\u6240\u4ee5\u6211\u4eec\u7684\u7b56\u7565\u662f\uff0c\u5148\u5582\u7ed9Actor\u4e00\u6761prompt \uff08\u8fd9\u91cc\u5047\u8bbebatch_size = 1\uff0c\u6240\u4ee5\u662f1\u6761prompt\uff09\uff0c\u8ba9\u5b83\u751f\u6210\u5bf9\u5e94\u7684response\u3002\u7136\u540e\uff0c\u6211\u4eec\u518d\u5c06\u201cprompt + response&#8221;\u9001\u5165\u6211\u4eec\u7684\u201c\u5956\u52b1-loss\u201d\u8ba1\u7b97\u4f53\u7cfb\u4e2d\u53bb\u7b97\u5f97\u6700\u540e\u7684loss\uff0c\u7528\u4e8e\u66f4\u65b0actor\u3002<\/p>\n\n\n\n<h4 id=\"h_677607581_6\">Reference Model\uff08\u53c2\u8003\u6a21\u578b\uff09<\/h4>\n\n\n\n<p><strong>Reference 
Model\uff08\u4ee5\u4e0b\u7b80\u79f0Ref\u6a21\u578b\uff09\u4e00\u822c\u4e5f\u7528SFT\u9636\u6bb5\u5f97\u5230\u7684SFT\u6a21\u578b\u505a\u521d\u59cb\u5316\uff0c\u5728\u8bad\u7ec3\u8fc7\u7a0b\u4e2d\uff0c\u5b83\u7684\u53c2\u6570\u662f\u51bb\u7ed3\u7684\u3002Ref\u6a21\u578b\u7684\u4e3b\u8981\u4f5c\u7528\u662f\u9632\u6b62Actor\u201d\u8bad\u6b6a\u201d\uff0c\u90a3\u4e48\u5b83\u5177\u4f53\u662f\u600e\u4e48\u505a\u5230\u8fd9\u4e00\u70b9\u7684\u5462\uff1f<\/strong><\/p>\n\n\n\n<figure class=\"wp-block-image size-large\"><img loading=\"lazy\" width=\"1024\" height=\"468\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-110-1024x468.png\" alt=\"\" class=\"wp-image-25319\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-110-1024x468.png 1024w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-110-300x137.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-110-768x351.png 768w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-110.png 1327w\" sizes=\"(max-width: 1024px) 100vw, 1024px\" 
\/><\/figure>\n\n\n\n<p>\u201c\u9632\u6b62\u6a21\u578b\u8bad\u6b6a\u201d\u6362\u4e00\u4e2a\u66f4\u8be6\u7ec6\u7684\u89e3\u91ca\u662f\uff1a<strong>\u6211\u4eec\u5e0c\u671b\u8bad\u7ec3\u51fa\u6765\u7684Actor\u6a21\u578b\u65e2\u80fd\u8fbe\u5230\u7b26\u5408\u4eba\u7c7b\u559c\u597d\u7684\u76ee\u7684\uff0c\u53c8\u5c3d\u91cf\u8ba9\u5b83\u548cSFT\u6a21\u578b\u4e0d\u8981\u5dee\u5f02\u592a\u5927<\/strong>\u3002\u7b80\u8a00\u4e4b\uff0c<strong>\u6211\u4eec\u5e0c\u671b\u4e24\u4e2a\u6a21\u578b\u7684\u8f93\u51fa\u5206\u5e03\u5c3d\u91cf\u76f8\u4f3c<\/strong>\u3002\u90a3\u4ec0\u4e48\u6307\u6807\u80fd\u7528\u6765\u8861\u91cf\u8f93\u51fa\u5206\u5e03\u7684\u76f8\u4f3c\u5ea6\u5462\uff1f\u6211\u4eec\u81ea\u7136\u800c\u7136\u60f3\u5230\u4e86<strong>KL\u6563\u5ea6<\/strong>\u3002<\/p>\n\n\n\n<p>\u5982\u56fe\u6240\u793a\uff1a<\/p>\n\n\n\n<ul><li><strong>\u5bf9Actor\u6a21\u578b<\/strong>\uff0c\u6211\u4eec\u5582\u7ed9\u5b83\u4e00\u4e2aprompt\uff0c\u5b83\u6b63\u5e38\u8f93\u51fa\u5bf9\u5e94\u7684response\u3002\u90a3\u4e48response\u4e2d\u6bcf\u4e00\u4e2atoken\u80af\u5b9a\u6709\u5b83\u5bf9\u5e94\u7684log_prob\u7ed3\u679c\u5440\uff0c\u6211\u4eec\u628a\u8fd9\u6837\u7684\u7ed3\u679c\u8bb0\u4e3a<strong>log_probs<\/strong><\/li><li><strong>\u5bf9Ref\u6a21\u578b<\/strong>\uff0c\u6211\u4eec\u628aActor\u751f\u6210\u7684&#8221;prompt + response&#8221;\u5582\u7ed9\u5b83\uff0c\u90a3\u4e48\u5b83\u540c\u6837\u80fd\u7ed9\u51fa\u6bcf\u4e2atoken\u7684log_prob\u7ed3\u679c\uff0c\u6211\u4eec\u8bb0\u5176\u4e3a<strong>ref_log_probs<\/strong><\/li><li>\u90a3\u4e48\u8fd9\u4e24\u4e2a\u6a21\u578b\u7684\u8f93\u51fa\u5206\u5e03\u76f8\u4f3c\u5ea6\u5c31\u53ef\u4ee5\u7528<strong><code>ref_log_probs - 
log_probs<\/code><\/strong>\u6765\u8861\u91cf\uff0c\u6211\u4eec\u53ef\u4ee5\u4ece\u4e24\u4e2a\u65b9\u9762\u6765\u7406\u89e3\u8fd9\u4e2a\u516c\u5f0f\uff1a<ul><li><strong>\u4ece\u76f4\u89c9\u4e0a\u7406\u89e3<\/strong>\uff0c\u4e24\u4e2a\u5206\u5e03\u7684\u76f8\u4f3c\u5ea6\u8d8a\u9ad8\uff0c\u8bf4\u660eRef\u6a21\u578b\u5bf9Actor\u6a21\u578b\u8f93\u51fa\u7684\u80af\u5b9a\u6027\u8d8a\u5927\u3002\u5373Ref\u6a21\u578b\u4e5f\u8ba4\u4e3a\uff0c\u5bf9\u4e8e\u67d0\u4e2a&nbsp;St&nbsp;\uff0c\u8f93\u51fa\u67d0\u4e2a&nbsp;At&nbsp;\u7684\u6982\u7387\u4e5f\u5f88\u9ad8\uff08&nbsp;P(At|St)&nbsp;\uff09\u3002\u8fd9\u65f6\u53ef\u4ee5\u8ba4\u4e3aActor\u6a21\u578b\u8f83Ref\u6a21\u578b\u6ca1\u6709\u8bad\u6b6a\u3002<\/li><li><strong>\u4eceKL\u6563\u5ea6\u4e0a\u7406\u89e3<\/strong>\uff1a<\/li><\/ul><\/li><\/ul>\n\n\n\n<figure class=\"wp-block-image size-full is-resized\"><img loading=\"lazy\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-111.png\" alt=\"\" class=\"wp-image-25326\" width=\"531\" height=\"57\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-111.png 747w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-111-300x33.png 300w\" sizes=\"(max-width: 531px) 100vw, 531px\" \/><\/figure>\n\n\n\n<p>\uff08\u5f53\u7136\u8fd9\u91cc\u4e0d\u662f\u4e25\u683c\u7684\u7b49\u4e8e\uff0c\u53ea\u662fKL\u6563\u5ea6\u7684\u8fd1\u4f3c\uff09\uff0c\u8fd9\u4e2a\u503c\u8d8a\u5c0f\u610f\u5473\u7740\u4e24\u4e2a\u5206\u5e03\u7684\u76f8\u4f3c\u6027\u8d8a\u9ad8\u3002<\/p>\n\n\n\n<p>\u6ce8\uff1a\u4f60\u53ef\u80fd\u5df2\u7ecf\u6ce8\u610f\u5230\uff0c\u6309\u7167KL\u6563\u5ea6\u7684\u5b9a\u4e49\uff0c\u8fd9\u91cc\u5199\u6210<code>log_probs - 
ref_log_probs<\/code>\u66f4\u5408\u9002\u4e00\u4e9b\u3002\u4f46\u662f\u5982\u679c\u4f60\u770b\u8fc7\u4e00\u4e9brlhf\u76f8\u5173\u7684\u8bba\u6587\u7684\u8bdd\uff0c\u4f60\u53ef\u80fd\u8bb0\u5f97\u5728\u8ba1\u7b97\u635f\u5931\u51fd\u6570\u65f6\uff0c\u6709\u4e00\u9879Rt\u2212KL\u6563\u5ea6&nbsp;\uff08\u5bf9\u8fd9\u4e2a\u6709\u7591\u60d1\u4e0d\u8981\u7d27\uff0c\u6211\u4eec\u9a6c\u4e0a\u5728\u540e\u6587\u7ec6\u8bf4\uff09\uff0c\u5373KL\u6563\u5ea6\u524d\u5e26\u4e86\u8d1f\u53f7\uff0c\u6240\u4ee5\u8fd9\u91cc\u6211\u5199\u6210<code>ref_log_probs - log_probs<\/code>\u8fd9\u6837\u7684\u5f62\u5f0f\uff0c\u66f4\u65b9\u4fbf\u5927\u5bb6\u4ece\u76f4\u89c9\u4e0a\u7406\u89e3\u8fd9\u4e2a\u516c\u5f0f\u3002<\/p>\n\n\n\n<p>\u73b0\u5728\uff0c\u6211\u4eec\u5df2\u7ecf\u77e5\u9053\u600e\u4e48\u5229\u7528Ref\u6a21\u578b\u548cKL\u6563\u5ea6\u6765\u9632\u6b62Actor\u8bad\u6b6a\u4e86\u3002<strong>KL\u6563\u5ea6\u5c06\u5728\u540e\u7eed\u88ab\u7528\u4e8eloss\u7684\u8ba1\u7b97<\/strong>\uff0c\u6211\u4eec\u5728\u540e\u6587\u4e2d\u4f1a\u8be6\u7ec6\u89e3\u91ca\u3002<\/p>\n\n\n\n<h4 id=\"h_677607581_7\">Critic Model\uff08\u8bc4\u8bba\u5bb6\u6a21\u578b\uff09<\/h4>\n\n\n\n<p><strong>Critic Model\u7528\u4e8e\u9884\u6d4b\u671f\u671b\u603b\u6536\u76ca<\/strong>&nbsp;Vt&nbsp;<strong>\uff0c\u548cActor\u6a21\u578b\u4e00\u6837\uff0c\u5b83\u9700\u8981\u505a\u53c2\u6570\u66f4\u65b0<\/strong>\u3002\u5b9e\u8df5\u4e2d\uff0cCritic Model\u7684\u8bbe\u8ba1\u548c\u521d\u59cb\u5316\u65b9\u5f0f\u4e5f\u6709\u5f88\u591a\u79cd\uff0c\u4f8b\u5982\u548cActor\u5171\u4eab\u90e8\u5206\u53c2\u6570\u3001\u4eceRW\u9636\u6bb5\u7684Reward Model\u521d\u59cb\u5316\u800c\u6765\u7b49\u7b49\u3002\u6211\u4eec\u8bb2\u89e3\u65f6\uff0c\u548cdeepspeed-chat\u7684\u5b9e\u73b0\u4fdd\u6301\u4e00\u81f4\uff1a\u4eceRW\u9636\u6bb5\u7684Reward 
Model\u521d\u59cb\u5316\u800c\u6765\u3002<br><br><br><strong>\u4f60\u53ef\u80fd\u60f3\u95ee\uff1a\u8bad\u7ec3Actor\u6a21\u578b\u6211\u80fd\u7406\u89e3\uff0c\u4f46\u6211\u8fd8\u662f\u4e0d\u660e\u767d\uff0c\u4e3a\u4ec0\u4e48\u8981\u5355\u72ec\u8bad\u7ec3\u4e00\u4e2aCritic\u6a21\u578b\u7528\u4e8e\u9884\u6d4b\u6536\u76ca\u5462\uff1f<\/strong><br>\u8fd9\u662f\u56e0\u4e3a\uff0c\u5f53\u6211\u4eec\u5728\u524d\u6587\u8ba8\u8bba\u603b\u6536\u76ca&nbsp;Vt&nbsp;\uff08\u5373\u65f6 + \u672a\u6765\uff09\u65f6\uff0c\u6211\u4eec\u662f\u7ad9\u5728\u4e0a\u5e1d\u89c6\u89d2\u7684\uff0c\u4e5f\u5c31\u662f\u8fd9\u4e2a&nbsp;Vt&nbsp;\u5c31\u662f\u5ba2\u89c2\u5b58\u5728\u7684\u3001\u771f\u6b63\u7684\u603b\u6536\u76ca\u3002\u4f46\u662f\u6211\u4eec\u5728\u8bad\u7ec3\u6a21\u578b\u65f6\uff0c\u5c31\u6ca1\u6709\u8fd9\u4e2a\u4e0a\u5e1d\u89c6\u89d2\u52a0\u6210\u4e86\uff0c<strong>\u4e5f\u5c31\u662f\u5728<\/strong>&nbsp;t&nbsp;<strong>\u65f6\u523b\uff0c\u6211\u4eec\u7ed9\u4e0d\u51fa\u5ba2\u89c2\u5b58\u5728\u7684\u603b\u6536\u76ca<\/strong>&nbsp;Vt&nbsp;<strong>\uff0c\u6211\u4eec\u53ea\u80fd\u8bad\u7ec3\u4e00\u4e2a\u6a21\u578b\u53bb\u9884\u6d4b\u5b83\u3002<\/strong><br><br><br><strong>\u6240\u4ee5\u603b\u7ed3\u6765\u8bf4\uff0c\u5728RLHF\u4e2d\uff0c\u6211\u4eec\u4e0d\u4ec5\u8981\u8bad\u7ec3\u6a21\u578b\u751f\u6210\u7b26\u5408\u4eba\u7c7b\u559c\u597d\u7684\u5185\u5bb9\u7684\u80fd\u529b\uff08Actor\uff09\uff0c\u4e5f\u8981\u63d0\u5347\u6a21\u578b\u5bf9\u4eba\u7c7b\u559c\u597d\u91cf\u5316\u5224\u65ad\u7684\u80fd\u529b\uff08Critic\uff09<\/strong>\u3002\u8fd9\u5c31\u662fCritic\u6a21\u578b\u5b58\u5728\u7684\u610f\u4e49\u3002\u6211\u4eec\u6765\u770b\u770b\u5b83\u7684\u5927\u81f4\u67b6\u6784\uff1a<\/p>\n\n\n\n<div class=\"wp-block-image\"><figure class=\"aligncenter size-full is-resized\"><img loading=\"lazy\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-112.png\" alt=\"\" class=\"wp-image-25336\" width=\"438\" height=\"366\" 
srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-112.png 588w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-112-300x251.png 300w\" sizes=\"(max-width: 438px) 100vw, 438px\" \/><\/figure><\/div>\n\n\n\n<p>deepspeed-chat\u91c7\u7528\u4e86Reward\u6a21\u578b\u4f5c\u4e3a\u5b83\u7684\u521d\u59cb\u5316\uff0c\u6240\u4ee5\u8fd9\u91cc\u6211\u4eec\u4e5f\u6309Reward\u6a21\u578b\u7684\u67b6\u6784\u6765\u7b80\u5355\u753b\u753b\u5b83\u3002\u4f60\u53ef\u4ee5\u7b80\u5355\u7406\u89e3\u6210\uff0cReward\/Critic\u6a21\u578b\u548cActor\u6a21\u578b\u7684\u67b6\u6784\u662f\u5f88\u76f8\u4f3c\u7684\uff08\u6bd5\u7adf\u8f93\u5165\u90fd\u4e00\u6837\uff09\uff0c\u540c\u65f6\uff0c\u5b83\u5728\u6700\u540e\u4e00\u5c42\u589e\u52a0\u4e86\u4e00\u4e2aValue Head\u5c42\uff0c\u8be5\u5c42\u662f\u4e2a\u7b80\u5355\u7684\u7ebf\u6027\u5c42\uff0c\u7528\u4e8e\u5c06\u539f\u59cb\u8f93\u51fa\u7ed3\u679c\u6620\u5c04\u6210\u5355\u4e00\u7684&nbsp;Vt&nbsp;\u503c\u3002<\/p>\n\n\n\n<p>\u5728\u56fe\u4e2d\uff0c&nbsp;Vt&nbsp;\u8868\u793aCritic\u6a21\u578b\u5bf9&nbsp;t&nbsp;\u65f6\u523b\u53ca\u672a\u6765\uff08response\u5b8c\u6210\uff09\u7684\u6536\u76ca\u9884\u4f30\u3002<\/p>\n\n\n\n<h4 id=\"h_677607581_8\">Reward Model\uff08\u5956\u52b1\u6a21\u578b\uff09<\/h4>\n\n\n\n<p>Reward 
Model\u7528\u4e8e\u8ba1\u7b97\u751f\u6210token&nbsp;At&nbsp;\u7684\u5373\u65f6\u6536\u76ca\uff0c\u5b83\u5c31\u662fRW\u9636\u6bb5\u6240\u8bad\u7ec3\u7684\u5956\u52b1\u6a21\u578b\uff0c\u5728RLHF\u8fc7\u7a0b\u4e2d\uff0c\u5b83\u7684\u53c2\u6570\u662f\u51bb\u7ed3\u7684\u3002<br><br><br><strong>\u4f60\u53ef\u80fd\u60f3\u95ee\uff1a\u4e3a\u4ec0\u4e48Critic\u6a21\u578b\u8981\u53c2\u4e0e\u8bad\u7ec3\uff0c\u800c\u540c\u6837\u662f\u548c\u6536\u76ca\u76f8\u5173\u7684Reward\u6a21\u578b\u7684\u53c2\u6570\u5c31\u53ef\u4ee5\u51bb\u7ed3\u5462\uff1f<\/strong><br>\u8fd9\u662f\u56e0\u4e3a\uff0cReward\u6a21\u578b\u662f\u7ad9\u5728\u4e0a\u5e1d\u89c6\u89d2\u7684\u3002\u8fd9\u4e2a\u4e0a\u5e1d\u89c6\u89d2\u6709\u4e24\u5c42\u542b\u4e49\uff1a<\/p>\n\n\n\n<ul><li>\u7b2c\u4e00\u70b9\uff0cReward\u6a21\u578b\u662f\u7ecf\u8fc7\u548c\u201c\u4f30\u7b97\u6536\u76ca\u201d\u76f8\u5173\u7684\u8bad\u7ec3\u7684\uff0c\u56e0\u6b64\u5728RLHF\u9636\u6bb5\u5b83\u53ef\u4ee5\u76f4\u63a5\u88ab\u5f53\u4f5c\u4e00\u4e2a\u80fd\u4ea7\u751f\u5ba2\u89c2\u503c\u7684\u6a21\u578b\u3002<\/li><li>\u7b2c\u4e8c\u70b9\uff0cReward\u6a21\u578b\u4ee3\u8868\u7684\u542b\u4e49\u5c31\u662f\u201c\u5373\u65f6\u6536\u76ca\u201d\uff0c\u4f60\u7684token&nbsp;At&nbsp;\u5df2\u7ecf\u4ea7\u751f\uff0c\u56e0\u6b64\u5373\u65f6\u6536\u76ca\u81ea\u7136\u53ef\u4ee5\u7acb\u523b\u7b97\u51fa\u3002<\/li><\/ul>\n\n\n\n<p><strong>\u4f60\u8fd8\u53ef\u80fd\u60f3\u95ee\uff1a\u6211\u5df2\u7ecf\u7528Critic\u9884\u6d4b\u51fa<\/strong>&nbsp;Vt&nbsp;<strong>\u4e86\uff0c\u800c\u8fd9\u4e2a<\/strong>&nbsp;Vt&nbsp;<strong>\u5305\u542b\u4e86\u201c\u5373\u65f6\u201d\u548c\u201c\u672a\u6765\u201d\u7684\u6982\u5ff5\uff0c\u90a3\u6211\u8fd8\u9700\u8981\u4ee3\u8868\u201c\u5373\u65f6\u201d\u7684<\/strong>&nbsp;Rt&nbsp;<strong>\u505a\u4ec0\u4e48\u5462\uff1f\u76f4\u63a5\u7528<\/strong>&nbsp;Vt&nbsp;<strong>\u4e0d\u5c31\u597d\u4e86\u5417\uff1f<\/strong><\/p>\n\n\n\n<p><br>\u4e3a\u4e86\u89e3\u7b54\u8fd9\u4e2a\u95ee\u9898\uff0c\u6211\u4eec\u5148\u56de\u987e\u4e0b1.2\u90e8\u5206\u4e
2d\u7ed9\u51fa\u7684\u4ef7\u503c\u51fd\u6570\uff1a&nbsp;Vt=Rt+\u03b3Vt+1<br>\u8fd9\u4e2a\u51fd\u6570\u544a\u8bc9\u6211\u4eec\uff0c\u6211\u4eec\u5f53\u524d\u53ef\u4ee5\u7528\u4e24\u4e2a\u7ed3\u679c\u6765\u8868\u793a&nbsp;t&nbsp;\u65f6\u523b\u7684\u603b\u6536\u76ca\uff1a<\/p>\n\n\n\n<ul><li>\u7ed3\u679c1\uff1aCritic\u6a21\u578b\u9884\u6d4b\u7684&nbsp;Vt<\/li><li>\u7ed3\u679c2\uff1aReward\u6a21\u578b\u9884\u6d4b\u7684&nbsp;Rt&nbsp;\u548ccritic\u6a21\u578b\u9884\u6d4b\u7684&nbsp;Vt+1<\/li><\/ul>\n\n\n\n<p>\u90a3\u4e48\u54ea\u4e00\u4e2a\u7ed3\u679c\u66f4\u9760\u8fd1\u4e0a\u5e1d\u89c6\u89d2\u7ed9\u51fa\u7684\u5ba2\u89c2\u503c\u5462\uff1f\u5f53\u7136\u662f\u7ed3\u679c2\uff0c\u56e0\u4e3a\u7ed3\u679c1\u5168\u9760\u9884\u6d4b\uff0c\u800c\u7ed3\u679c2\u4e2d\u7684&nbsp;Rt&nbsp;\u662f\u4e8b\u5b9e\u6570\u636e\u3002<br>\u6211\u4eec\u77e5\u9053Critic\u6a21\u578b\u4e5f\u662f\u53c2\u4e0e\u53c2\u6570\u66f4\u65b0\u7684\uff0c\u6211\u4eec\u53ef\u4ee5\u7528<code>MSE(\u4e0a\u5e1d\u89c6\u89d2\u7684\u5ba2\u89c2\u6536\u76ca-Critic\u6a21\u578b\u9884\u6d4b\u7684\u6536\u76ca)<\/code>\u6765\u8861\u91cf\u5b83\u7684loss\u3002<strong>\u4f46\u662f\u4e0a\u5e1d\u89c6\u89d2\u7684\u5ba2\u89c2\u6536\u76ca\u6211\u4eec\u662f\u4e0d\u77e5\u9053\u7684\uff0c\u53ea\u80fd\u7528\u5df2\u77e5\u4e8b\u5b9e\u6570\u636e\u53bb\u903c\u8fd1\u5b83\uff0c\u6240\u4ee5\u6211\u4eec\u5c31\u7528<\/strong>&nbsp;Rt+\u03b3\u2217Vt+1&nbsp;<strong>\u6765\u505a\u8fd1\u4f3c\u3002<\/strong>\u8fd9\u5c31\u662f&nbsp;Rt,Vt&nbsp;\u540c\u65f6\u5b58\u5728\u7684\u610f\u4e49<\/p>\n\n\n\n<p>Reward\u6a21\u578b\u548ccritic\u6a21\u578b\u975e\u5e38\u76f8\u4f3c\uff0c\u8fd9\u91cc\u6211\u4eec\u5c31\u53ea\u7ed9\u51fa\u67b6\u6784\u56fe\uff0c\u4e0d\u518d\u505a\u8fc7\u591a\u7684\u8bf4\u660e\u3002<\/p>\n\n\n\n<div class=\"wp-block-image\"><figure class=\"aligncenter size-full is-resized\"><img loading=\"lazy\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-113.png\" alt=\"\" class=\"wp-image-25342\" width=\"371\" height=\"307\" 
srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-113.png 577w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-113-300x248.png 300w\" sizes=\"(max-width: 371px) 100vw, 371px\" \/><\/figure><\/div>\n\n\n\n<h3 id=\"h_677607581_9\">RLHF\u4e2d\u7684loss\u8ba1\u7b97<\/h3>\n\n\n\n<p>\u5230\u76ee\u524d\u4e3a\u6b62\uff0c\u6211\u4eec\u5df2\u7ecf\u57fa\u672c\u4e86\u89e3\u4e86RLHF\u7684\u8bad\u7ec3\u6846\u67b6\uff0c\u4ee5\u53ca\u5176\u4e2d\u7684\u56db\u4e2a\u91cd\u8981\u89d2\u8272\uff08\u8bad\u7ec3\u4e00\u4e2aRLHF\uff0c\u67094\u4e2a\u6a21\u578b\u5728\u786c\u4ef6\u4e0a\u8dd1\uff0c\u53ef\u60f3\u800c\u77e5\u5bf9\u5b58\u50a8\u7684\u538b\u529b\uff09\u3002\u5728\u672c\u8282\u4e2d\uff0c\u6211\u4eec\u4e00\u8d77\u6765\u89e3\u8bfbRLHF\u7684loss\u8ba1\u7b97\u65b9\u5f0f\u3002\u5728\u89e3\u8bfb\u4e2d\uff0c\u6211\u4eec\u4f1a\u518d\u4e00\u6b21\u7406\u4e00\u904dRLHF\u7684\u6574\u4f53\u8bad\u7ec3\u8fc7\u7a0b\uff0c\u586b\u8865\u76f8\u5173\u7ec6\u8282\u3002\u5728\u8fd9\u4e4b\u540e\uff0c\u6211\u4eec\u5c31\u53ef\u4ee5\u6765\u770b\u4ee3\u7801\u89e3\u6790\u4e86\u3002<br><br><br>\u5728\u7b2c\u4e09\u90e8\u5206\u7684\u8bb2\u89e3\u4e2d\uff0c\u6211\u4eec\u77e5\u9053Actor\u548cCritic\u6a21\u578b\u90fd\u4f1a\u505a\u53c2\u6570\u66f4\u65b0\uff0c\u6240\u4ee5\u6211\u4eec\u7684loss\u4e5f\u5206\u62102\u4e2a\uff1a<\/p>\n\n\n\n<ul><li><strong>Actor loss\uff1a<\/strong>\u7528\u4e8e\u8bc4\u4f30Actor\u662f\u5426\u4ea7\u751f\u4e86\u7b26\u5408\u4eba\u7c7b\u559c\u597d\u7684\u7ed3\u679c\uff0c\u5c06\u4f5c\u7528\u4e8eActor\u7684BWD\u4e0a\u3002<\/li><li><strong>Critic loss\uff1a<\/strong>\u7528\u4e8e\u8bc4\u4f30Critic\u662f\u5426\u6b63\u786e\u9884\u6d4b\u4e86\u4eba\u7c7b\u7684\u559c\u597d\uff0c\u5c06\u4f5c\u7528\u4e8eCritic\u7684BWD\u4e0a\u3002<\/li><\/ul>\n\n\n\n<p>\u6211\u4eec\u8be6\u7ec6\u6765\u770b\u8fd9\u4e24\u8005\u3002<\/p>\n\n\n\n<h4 id=\"h_677607581_10\">Actor loss<\/h4>\n\n\n\n<p 
id=\"h_677607581_11\"><strong>\uff081\uff09\u76f4\u89c2\u8bbe\u8ba1<\/strong><\/p>\n\n\n\n<p>\u6211\u4eec\u5148\u6765\u770b\u4e00\u4e2a\u76f4\u89c2\u7684loss\u8bbe\u8ba1\u65b9\u5f0f\uff1a<\/p>\n\n\n\n<ul><li>Actor\u63a5\u6536\u5230\u5f53\u524d\u4e0a\u6587&nbsp;St&nbsp;\uff0c\u4ea7\u51fatoken&nbsp;At&nbsp;\uff08&nbsp;P(At|St)&nbsp;\uff09<\/li><li>Critic\u6839\u636e&nbsp;St,At&nbsp;\uff0c\u4ea7\u51fa\u5bf9\u603b\u6536\u76ca\u7684\u9884\u6d4b&nbsp;<strong>Vt<\/strong><\/li><li>\u90a3\u4e48Actor loss\u53ef\u4ee5\u8bbe\u8ba1\u4e3a\uff1a&nbsp;<\/li><\/ul>\n\n\n\n<div class=\"wp-block-image\"><figure class=\"aligncenter size-full is-resized\"><img loading=\"lazy\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-115.png\" alt=\"\" class=\"wp-image-25365\" width=\"437\" height=\"26\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-115.png 562w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-115-300x18.png 300w\" sizes=\"(max-width: 437px) 100vw, 437px\" \/><\/figure><\/div>\n\n\n\n<p><strong>\u6c42\u548c\u7b26\u53f7\u8868\u793a\u6211\u4eec\u53ea\u8003\u8651response\u90e8\u5206\u6240\u6709token\u7684loss<\/strong>\uff0c\u4e3a\u4e86\u8868\u8fbe\u7b80\u4fbf\uff0c\u6211\u4eec\u5148\u628a\u8fd9\u4e2a\u6c42\u548c\u7b26\u53f7\u7565\u53bb\uff08\u4e0b\u6587\u4e5f\u662f\u540c\u7406\uff09\uff0c\u4e5f\u5c31\u662f\u8bf4\uff1a<\/p>\n\n\n\n<div class=\"wp-block-image\"><figure class=\"aligncenter size-full is-resized\"><img loading=\"lazy\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-116.png\" alt=\"\" class=\"wp-image-25367\" width=\"237\" height=\"29\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-116.png 354w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-116-300x36.png 300w\" sizes=\"(max-width: 237px) 100vw, 237px\" 
\/><\/figure><\/div>\n\n\n\n<p>\u6211\u4eec\u5e0c\u671bminimize\u8fd9\u4e2aactor_loss\u3002<\/p>\n\n\n\n<p><br><strong>\u8fd9\u4e2a\u8bbe\u8ba1\u7684\u76f4\u89c2\u89e3\u91ca\u662f\uff1a<\/strong><\/p>\n\n\n\n<ul><li><strong>\u5f53&nbsp;Vt&gt;0&nbsp;\u65f6\uff0c\u610f\u5473\u7740Critic\u5bf9Actor\u5f53\u524d\u91c7\u53d6\u7684\u52a8\u4f5c\u7ed9\u4e86\u6b63\u5411\u53cd\u9988\uff0c\u56e0\u6b64\u6211\u4eec\u5c31\u9700\u8981\u5728\u8bad\u7ec3\u8fed\u4ee3\u4e2d\u63d0\u9ad8&nbsp;P(At|St)&nbsp;\uff0c\u8fd9\u6837\u5c31\u80fd\u8fbe\u5230\u51cf\u5c0floss\u7684\u4f5c\u7528\u3002<\/strong><\/li><li><strong>\u5f53&nbsp;Vt&lt;0&nbsp;\u65f6\uff0c\u610f\u5473\u7740Critic\u5bf9Actor\u5f53\u524d\u91c7\u53d6\u7684\u52a8\u4f5c\u7ed9\u4e86\u8d1f\u5411\u53cd\u9988\uff0c\u56e0\u6b64\u6211\u4eec\u5c31\u9700\u8981\u5728\u8bad\u7ec3\u8fed\u4ee3\u4e2d\u964d\u4f4e&nbsp;P(At|St)&nbsp;\uff0c\u8fd9\u6837\u5c31\u80fd\u8fbe\u5230\u51cf\u5c0floss\u7684\u4f5c\u7528\u3002<\/strong><\/li><\/ul>\n\n\n\n<p><strong>\u4e00\u53e5\u8bdd\u603b\u7ed3\uff1a\u8fd9\u4e2aloss\u8bbe\u8ba1\u7684\u542b\u4e49\u662f\uff0c\u5bf9\u4e0a\u6587<\/strong>&nbsp;St&nbsp;<strong>\u800c\u8a00\uff0c\u5982\u679ctoken<\/strong>&nbsp;At&nbsp;<strong>\u4ea7\u751f\u7684\u6536\u76ca\u8f83\u9ad8\uff0c\u90a3\u5c31\u589e\u5927\u5b83\u51fa\u73b0\u7684\u6982\u7387\uff0c\u5426\u5219\u964d\u4f4e\u5b83\u51fa\u73b0\u7684\u6982\u7387\u3002<\/strong><\/p>\n\n\n\n<p id=\"h_677607581_12\"><strong>\uff082\uff09\u5f15\u5165\u4f18\u52bf\uff08Advantage\uff09<\/strong><\/p>\n\n\n\n<p>\u5728\u5f00\u59cb\u8bb2\u89e3\u4e4b\u524d\uff0c\u6211\u4eec\u4e3e\u4e2a\u5c0f\u4f8b\u5b50\uff1a<br>\u5047\u8bbe\u5728\u738b\u8005\u4e2d\uff0c\u4e2d\u8def\u60f3\u652f\u63f4\u53d1\u80b2\u8def\uff0c\u8fd9\u65f6\u4e2d\u8def\u6709\u4e24\u79cd\u9009\u62e9\uff1a1. \u8d70\u81ea\u5bb6\u91ce\u533a\u30022. 
\u8d70\u5927\u9f99\u8def\u3002<br>\u4e2d\u8def\u9009\u62e9\u8d70\u5927\u9f99\u8def\uff0c\u5f53\u5979\u505a\u51fa\u8fd9\u4e2a\u51b3\u5b9a\u540e\uff0cCritic\u544a\u8bc9\u5979\u53ef\u4ee5\u65361\u4e2a\u4eba\u5934\u3002\u7ed3\u679c\uff0c\u6b64\u523b\u5bf9\u9762\u6253\u91ce\u6b63\u5728\u81ea\u5bb6\u91c7\u7075\u829d\uff0c\u5bf9\u9762\u4e5f\u6ca1\u6709\u4ec0\u4e48\u82df\u8349\u82f1\u96c4\uff0c\u4e2d\u8def\u4e00\u8def\u76f4\u4e0a\uff0c\u6700\u7ec8\u6536\u52722\u4e2a\u4eba\u5934\u3002<br>\u56e0\u4e3a\u5b9e\u9645\u6536\u5272\u7684\u4eba\u5934\u6bd4\u9884\u671f\u8981\u591a1\u4e2a\uff0c\u4e2d\u8def\u5c1d\u5230\u4e86\u751c\u5934\uff0c\u6240\u4ee5\u5979\u589e\u5927\u4e86\u201c\u652f\u63f4\u53d1\u80b2\u8def\u8d70\u5927\u9f99\u8def\u201d\u7684\u6982\u7387\u3002<br><strong>\u8fd9\u4e2a\u591a\u51fa\u6765\u7684\u201c\u751c\u5934\u201d\uff0c\u5c31\u53eb\u505a\u201c\u4f18\u52bf\u201d(Advantage)\u3002<\/strong><br><br><br><strong>\u5bf9NLP\u4efb\u52a1\u6765\u8bf4\uff0c\u5982\u679cCritic\u5bf9<\/strong>&nbsp;At&nbsp;<strong>\u7684\u603b\u6536\u76ca\u9884\u6d4b\u4e3a<\/strong><em>&nbsp;Vt<\/em>&nbsp;<strong>\uff0c\u4f46\u5b9e\u9645\u6267\u884c<\/strong>&nbsp;<em>At&nbsp;<\/em><strong>\u540e\u7684\u603b\u6536\u76ca\u662f<\/strong><em>&nbsp;Rt+\u03b3\u2217Vt+1<\/em>&nbsp;<strong>\uff0c\u6211\u4eec\u5c31\u5b9a\u4e49\u4f18\u52bf\u4e3a\uff1a<\/strong><\/p>\n\n\n\n<div class=\"wp-block-image\"><figure class=\"aligncenter size-full is-resized\"><img loading=\"lazy\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-117.png\" alt=\"\" class=\"wp-image-25372\" width=\"221\" height=\"36\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-117.png 321w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-117-300x49.png 300w\" sizes=\"(max-width: 221px) 100vw, 221px\" \/><\/figure><\/div>\n\n\n\n<p>\u6211\u4eec\u7528&nbsp;Advt&nbsp;\u66ff\u6362\u6389&nbsp;Vt&nbsp;\uff0c\u5219\u6b64\u523bactor_loss\u53d8\u4e3a\uff1a<\/p>\n\n\n\n<div 
class=\"wp-block-image\"><figure class=\"aligncenter size-full is-resized\"><img loading=\"lazy\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-118.png\" alt=\"\" class=\"wp-image-25374\" width=\"260\" height=\"24\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-118.png 393w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-118-300x27.png 300w\" sizes=\"(max-width: 260px) 100vw, 260px\" \/><\/figure><\/div>\n\n\n\n<p id=\"h_677607581_13\"><strong>\uff083\uff09\u91cd\u65b0\u8bbe\u8ba1&nbsp;Rt<\/strong><\/p>\n\n\n\n<p>\u603b\u7ed3\u4e00\u4e0b\uff0c\u5230\u76ee\u524d\u4e3a\u6b62\uff0c\u6211\u4eec\u7684actor_loss\u5f62\u5f0f\u4e3a\uff1a<\/p>\n\n\n\n<div class=\"wp-block-image\"><figure class=\"aligncenter size-full is-resized\"><img loading=\"lazy\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-119.png\" alt=\"\" class=\"wp-image-25378\" width=\"259\" height=\"67\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-119.png 405w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-119-300x78.png 300w\" sizes=\"(max-width: 259px) 100vw, 259px\" \/><\/figure><\/div>\n\n\n\n<p>\u540c\u65f6\u6ce8\u610f\uff0c\u8fd9\u4e2aactor_loss\u5e94\u8be5\u662fresponse\u7684\u6240\u6709token loss\u7684sum\u6216\u8005avg\u3002\u8fd9\u91cc\u4e3a\u4e86\u8868\u8fbe\u65b9\u4fbf\uff0c\u6211\u4eec\u7684\u516c\u5f0f\u7565\u53bb\u4e86\u6c42\u548c\u6216\u6c42\u5e73\u5747\u7684\u7b26\u53f7\u3002<\/p>\n\n\n\n<p>\u6309\u7167\u8fd9\u4e2a\u7406\u89e3\uff0c&nbsp;Rt&nbsp;\u5e94\u8be5\u8868\u793a\u6bcf\u4e2aActor\u4ea7\u51fatoken&nbsp;At&nbsp;\u5e26\u6765\u7684\u5373\u65f6\u6536\u76ca\uff0c\u6b63\u5982\u4e0b\u56fe\u6240\u793a\uff08\u5176\u4e2d&nbsp;T&nbsp;\u8868\u793a\u6700\u540e\u4e00\u4e2a\u65f6\u523b\uff09\uff1a<\/p>\n\n\n\n<div class=\"wp-block-image\"><figure class=\"aligncenter size-full is-resized\"><img loading=\"lazy\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-120.png\" alt=\"\" 
class=\"wp-image-25379\" width=\"465\" height=\"368\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-120.png 676w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-120-300x237.png 300w\" sizes=\"(max-width: 465px) 100vw, 465px\" \/><\/figure><\/div>\n\n\n\n<p>\u4f46\u5728deepspeed-chat\u7684RLHF\u5b9e\u8df5\u4e2d\uff0c\u5bf9&nbsp;Rt&nbsp;\u505a\u4e86\u53e6\u4e00\u79cd\u8bbe\u8ba1\uff1a<\/p>\n\n\n\n<figure class=\"wp-block-image size-full is-resized\"><img loading=\"lazy\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-121.png\" alt=\"\" class=\"wp-image-25380\" width=\"580\" height=\"222\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-121.png 946w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-121-300x115.png 300w\" sizes=\"(max-width: 580px) 100vw, 580px\" \/><\/figure>\n\n\n\n<p><strong>\u57fa\u4e8e\u8fd9\u4e9b\uff0c\u4e0a\u9762\u8fd9\u4e2a\u5bf9<\/strong>&nbsp;Rt&nbsp;<strong>\u7684\u8bbe\u8ba1\u53ef\u7406\u89e3\u6210\uff1a<\/strong><\/p>\n\n\n\n<ul><li><strong>\u5f53<\/strong>t\u2260T<strong>\u65f6\uff0c\u6211\u4eec\u66f4\u52a0\u5173\u5fc3Actor\u662f\u5426\u6709\u5728Ref\u7684\u7ea6\u675f\u4e0b\u751f\u4ea7token&nbsp;<\/strong>At<\/li><li><strong>\u5f53<\/strong>&nbsp;t=T<strong>\u65f6\uff0c\u6211\u4eec\u4e0d\u4ec5\u5173\u5fc3Actor\u662f\u5426\u9075\u4ece\u4e86Ref\u7684\u7ea6\u675f\uff0c\u4e5f\u5173\u5fc3\u771f\u6b63\u7684\u5373\u65f6\u6536\u76ca<\/strong>Rt<\/li><\/ul>\n\n\n\n<p><br><strong>\u4e3a\u4ec0\u4e48\u53ea\u6709\u6700\u540e\u4e00\u4e2a\u65f6\u523b\u7684&nbsp;Rt&nbsp;\u88ab\u7eb3\u5165\u4e86\u8003\u91cf\u5462\uff1f\u8fd9\u662f\u56e0\u4e3a\u5728Reward\u6a21\u578b\u8bad\u7ec3\u9636\u6bb5\uff0c\u5c31\u662f\u7528\u8fd9\u4e2a\u4f4d\u7f6e\u7684&nbsp;Rt&nbsp;\u6765\u8868\u793a\u5bf9\u5b8c\u6574\u7684prompt + 
response\u7684\u5956\u52b1\u9884\u6d4b<\/strong>\uff08\u4f46\u4e0d\u59a8\u788d\u4f60\u7406\u89e3\u6210\u662f\u6267\u884c\u5b8c&nbsp;AT&nbsp;\u7684\u5373\u65f6\u5956\u52b1\uff09\uff0c\u7136\u540e\u7528\u8fd9\u4e2a\u6307\u6807\u6765\u505a\u6a21\u578beval\u7684\uff08\u4f46\u662fReward\u8bad\u7ec3\u9636\u6bb5\u7b97loss\u65f6\uff0c\u8fd8\u662f\u8003\u8651\u4e86response\u90e8\u5206\u6240\u6709token\u8f93\u51fa\u7684reward\u503c\uff09\u3002\u6240\u4ee5\u5230\u4e86RLHF\u7684\u573a\u666f\u4e0b\uff0c\u5176\u4f59\u65f6\u523b\u7684\u5373\u65f6\u5956\u52b1\uff0c\u6211\u4eec\u5c31\u7528\u201cActor\u662f\u5426\u9075\u5faa\u4e86Ref\u7684\u7ea6\u675f\u201d\u6765\u8fdb\u884c\u8bc4\u4ef7\u3002<br><br><br>\u9700\u8981\u6ce8\u610f\u7684\u662f\uff0c&nbsp;Rt&nbsp;\u7684\u8bbe\u8ba1\u5e76\u4e0d\u53ea\u6709\u8fd9\u4e00\u79cd\u3002deepspeed\u5728\u81ea\u5df1\u7684\u4ee3\u7801\u6ce8\u91ca\u4e2d\u4e5f\u6709\u63d0\u8fc7\uff0c\u53ef\u4ee5\u5c1d\u8bd5\u628a\u6700\u540e\u4e00\u4e2a\u65f6\u523b\u7684&nbsp;RT&nbsp;\u66ff\u6362\u6210\u6240\u6709token\u7684\u5373\u65f6\u5956\u52b1\u7684\u5e73\u5747\u503c\u3002\u5982\u679c\u7ad9\u5728\u8fd9\u4e2a\u89d2\u5ea6\u7406\u89e3\u7684\u8bdd\uff0c\u6211\u4eec\u540c\u6837\u4e5f\u53ef\u4ee5\u5c1d\u8bd5\u5728\u6bcf\u4e00\u4e2a\u4f4d\u7f6e\u7684\u5956\u52b1\u8861\u91cf\u4e0a\u5f15\u5165&nbsp;Rt&nbsp;\u3002<\/p>\n\n\n\n<p>\u4ee3\u7801\u5b9e\u8df5\u5982\u4e0b\uff1a<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>def compute_rewards(self, prompts, log_probs, ref_log_probs, reward_score,\n                        action_mask):\n        \"\"\"\n        reward_function\uff1a\u8ba1\u7b97\u6700\u7ec8\u7684reward\u5206\u6570\n        \u590d\u4e60\u4e00\u4e0b\u51e0\u4e2a\u76f8\u5173\u53c2\u6570\u7684\u9ed8\u8ba4\u503c\uff1a\n        self.kl_ctl = 0.1\n        self.clip_reward_value = 5\n        \n        \u5bf9\u4e8ebatch\u4e2d\u7684\u67d0\u4e2aprompt\u6765\u8bf4\uff0c\u5b83\u6700\u7ec8\u7684reward\u5206\u6570\u4e3a\uff1a\n        (1) 
\u5148\u8ba1\u7b97actor\u548cref_model\u7684logit\u76f8\u4f3c\u5ea6\uff1a -self.kl_ctl * (log_probs - ref_log_probs)\n            \u5176\u5b9e\u5199\u6210self.kl_ctl * (ref_log_probs - log_probs)\u66f4\u597d\u7406\u89e3\u4e9b\n            \u8fd9\u4e2a\u503c\u8d8a\u5927\uff0c\u8bf4\u660eref_model\u5bf9actor\u751f\u6210\u7684\u7ed3\u679c\u7684\u8ba4\u53ef\u5ea6\u8d8a\u9ad8\uff08\u5373\u8868\u660erlhf\u6ca1\u6709\u8bad\u6b6a\uff09\uff0c\n            \u6ca1\u6709\u8bad\u6b6a\u7684\u60c5\u51b5\u4e0b\u6211\u4eec\u4e5f\u5e94\u8be5\u7ed9\u6a21\u578b\u4e00\u4e9b\u5956\u52b1\uff0c\u8fd9\u4e2a\u5956\u52b1\u5c31\u662fself.kl_ctl * (ref_log_probs - log_probs)\n            \n        \uff082\uff09\u7531\u4e8e\u6211\u4eec\u53ea\u53d6\u6700\u540e\u4e00\u4e2atoken\u5bf9\u5e94\u4f4d\u7f6e\u7684\u5206\u6570\u4f5c\u4e3areward_score\uff0c\u56e0\u6b64\u6211\u4eec\u53ea\u9700\u8981\uff1a\n            self.kl_ctl * (ref_log_probs - log_probs)\u7684\u6700\u540e\u4e00\u4f4d + reward_score\n         \n         (3) \u540c\u65f6\u6211\u4eec\u5bf9reward_score\u4e5f\u505a\u4e86\u5927\u5c0f\u9650\u5236\uff0c\u6700\u5927\u4e0d\u8d85\u8fc7self.clip_reward_value\uff08\u8d85\u8fc7\u7edf\u4e00\u7ed9\u6210self.clip_reward_value\uff09\uff0c\n             \u6700\u5c0f\u4e0d\u4f4e\u4e8e-self.clip_reward_value\uff08\u4f4e\u4e8e\u7edf\u4e00\u7ed9\u6210-self.clip_reward_value\uff09\n        \n         (4) \u6700\u540e\u8fd4\u56de\u7684rewards\u5927\u5c0f\u4e3a\uff1a\uff08batch_size, \u5404\u6761\u6570\u636e\u7684\u957f\u5ea6\uff09\uff0c\u5bf9batch\u4e2d\u7684\u6bcf\u6761\u6570\u636e\u6765\u8bf4\uff1a\n             - response\u7684\u6700\u540e\u4e00\u4f4d\uff1aself.kl_ctl * (ref_log_probs - log_probs)\u7684\u6700\u540e\u4e00\u4f4d + reward_score\n             - response\u7684\u5176\u4f59\u4f4d\u7f6e\uff1aself.kl_ctl * (ref_log_probs - log_probs)\n        \n        \"\"\"\n\n        kl_divergence_estimate = -self.kl_ctl * (log_probs - ref_log_probs)\n        rewards = kl_divergence_estimate\n        <em># 
---------------------------------------------------------------------------------------------------<\/em>\n        <em># response\u5f00\u59cb\u7684\u4f4d\u7f6e<\/em>\n        <em># \uff08\u56e0\u4e3a\u6211\u4eec\u5bf9prompt\u505a\u8fc7padding\u5904\u7406\uff0c\u56e0\u6b64batch\u4e2d\u6bcf\u4e2aprompt\u957f\u5ea6\u4e00\u81f4\uff0c\u4e5f\u5c31\u610f\u5473\u7740\u6bcf\u4e2aresponse\u5f00\u59cb\u7684\u4f4d\u7f6e\u4e00\u81f4\uff09<\/em>\n        <em># \uff08\u6240\u4ee5\u8fd9\u91ccstart\u662f\u4e0d\u52a0s\u7684\uff0c\u53ea\u662f\u4e00\u4e2aint\uff09<\/em>\n        <em># ---------------------------------------------------------------------------------------------------<\/em>\n        start = prompts.shape&#091;1] - 1\n        <em># ---------------------------------------------------------------------------------------------------<\/em>\n        <em># response\u7ed3\u675f\u7684\u4f4d\u7f6e<\/em>\n        <em># \uff08\u56e0\u4e3a\u4e00\u4e2abatch\u4e2d\uff0c\u6bcf\u4e2aresponse\u7684\u957f\u5ea6\u4e0d\u4e00\u6837\uff0c\u6240\u4ee5response\u7684\u7ed3\u675f\u4f4d\u7f6e\u4e5f\u4e0d\u4e00\u6837\uff09<\/em>\n        <em># \uff08\u6240\u4ee5\u8fd9\u91ccend\u662f\u52a0s\u7684\uff0cends\u7684\u5c3a\u5bf8\u662f(batch_size,)<\/em>\n        <em># ---------------------------------------------------------------------------------------------------<\/em>\n        ends = start + action_mask&#091;:, start:].sum(1) + 1\n        <em># ---------------------------------------------------------------------------------------------------<\/em>\n        <em># \u5bf9rewards_score\u505a\u9650\u5236<\/em>\n        <em># ---------------------------------------------------------------------------------------------------<\/em>\n        reward_clip = torch.clamp(reward_score, -self.clip_reward_value,\n                                  self.clip_reward_value)\n        batch_size = log_probs.shape&#091;0]\n        for j in range(batch_size):\n            rewards&#091;j, start:ends&#091;j]]&#091;-1] += 
reward_clip&#091;j] <em># <\/em>\n\n        return rewards\n\n\u6ce8\u610f\u8f93\u5165\u8f93\u51fa\u7684\u7ef4\u5ea6\uff0cprompts \u662f\u4e00\u4e2a &#091;batch size, padded prompt length] \u7684 matrix\uff0cref_log_probs \u548c log_probs \u662f &#091;batch size, padded prompt with response length] \u5927\u5c0f\u7684\u77e9\u9635\uff0c\u7136\u540e\u53ea\u6709\u4ece prompt \u7ed3\u675f\u5230 response \u7ed3\u675f\u8fd9\u4e00\u5757\u513f\u7684 reward \u624d\u4f1a\u5b9e\u9645\u6709\u4f5c\u7528\uff0cprompt \u7684 reward \u662f\u4e0d\u8ba1\u7b97\u7684\u3002\n\nprompt \u6709\u7edf\u4e00\u7684 padding\uff0c\u6240\u4ee5 response \u7684 start \u4f4d\u7f6e\u662f\u552f\u4e00\u7684\uff0c\u800c ends \u5219\u901a\u8fc7 action_mask \u4e2d\u7684 1 \u5143\u7d20\u7684\u622a\u6b62\u4e3a\u6b62\u8ba1\u7b97\u5f97\u5230\u3002\u6700\u540e\uff0c\u5728\u8fd9\u4e2a batch \u4e2d\uff0c\u6bcf\u4e2a prompt \u7684 reward \u7684\u7ed3\u5c3e\u90a3\u4e2a token \u52a0\u4e0a reward_score \u7ecf\u8fc7 clip \u5f97\u5230\u7684 reward\u3002<\/code><\/pre>\n\n\n\n<p><strong>\uff084\uff09\u91cd\u65b0\u8bbe\u8ba1\u4f18\u52bf<\/strong><\/p>\n\n\n\n<p>\u597d\uff0c\u518d\u603b\u7ed3\u4e00\u4e0b\uff0c\u76ee\u524d\u4e3a\u6b62\u6211\u4eec\u7684actor_loss\u4e3a\uff1a<\/p>\n\n\n\n<figure class=\"wp-block-image size-full is-resized\"><img loading=\"lazy\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-122.png\" alt=\"\" class=\"wp-image-25385\" width=\"265\" height=\"92\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-122.png 393w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-122-300x104.png 300w\" sizes=\"(max-width: 265px) 100vw, 265px\" 
\/><\/figure>\n\n\n\n<p>\u540c\u65f6\uff0c\u6211\u4eec\u5bf9&nbsp;Rt&nbsp;\u8fdb\u884c\u4e86\u6539\u9020\uff0c\u4f7f\u5176\u80fd\u591f\u8861\u91cfActor\u6a21\u578b\u662f\u5426\u9075\u4ece\u4e86Ref\u6a21\u578b\u7684\u7ea6\u675f\u3002<br>\u73b0\u5728\u6211\u4eec\u628a\u6539\u9020\u7126\u70b9\u653e\u5728&nbsp;Advt&nbsp;\u4e0a\uff0c\u56de\u60f3\u4e00\u4e0b\uff0c\u65e2\u7136\u5bf9\u4e8e\u6536\u76ca\u800c\u8a00\uff0c\u5206\u4e3a\u5373\u65f6\u548c\u672a\u6765\uff0c\u90a3\u4e48\u5bf9\u4e8e\u4f18\u52bf\u800c\u8a00\uff0c\u662f\u4e0d\u662f\u4e5f\u80fd\u5f15\u5165\u5bf9\u672a\u6765\u4f18\u52bf\u7684\u8003\u91cf\u5462\uff1f\u8fd9\u6837\uff0c\u6211\u4eec\u5c31\u53ef\u4ee5\u628a&nbsp;Advt&nbsp;\u6539\u5199\u6210\u5982\u4e0b\u5f62\u5f0f\uff1a<\/p>\n\n\n\n<figure class=\"wp-block-image size-full is-resized\"><img loading=\"lazy\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-123.png\" alt=\"\" class=\"wp-image-25386\" width=\"400\" height=\"40\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-123.png 550w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-123-300x30.png 300w\" sizes=\"(max-width: 400px) 100vw, 400px\" 
\/><\/figure>\n\n\n\n<p><br>\uff08\u719f\u6089\u5f3a\u5316\u5b66\u4e60\u7684\u670b\u53cb\u5e94\u8be5\u80fd\u4e00\u773c\u770b\u51fa\u8fd9\u662fGAE\uff0c\u8fd9\u91cc\u6211\u4eec\u4e0d\u6253\u7b97\u505a\u590d\u6742\u7684\u4ecb\u7ecd\uff0c\u4e00\u5207\u90fd\u7ad9\u5728\u76f4\u89c9\u7684\u89d2\u5ea6\u7406\u89e3\uff09<br><strong>\u5176\u4e2d\uff0c\u65b0\u5f15\u5165\u7684<\/strong>&nbsp;\u03bb&nbsp;<strong>\u4e5f\u662f\u4e00\u4e2a\u5e38\u91cf\uff0c\u53ef\u5c06\u5176\u7406\u89e3\u4e3a\u6743\u8861\u56e0\u5b50\uff0c\u76f4\u89c9\u4e0a\u770b\u5b83\u63a7\u5236\u4e86\u5728\u8ba1\u7b97\u5f53\u524d\u4f18\u52bf\u65f6\u5bf9\u672a\u6765\u4f18\u52bf\u7684\u8003\u91cf\u3002\uff08\u4ece\u5f3a\u5316\u5b66\u4e60\u7684\u89d2\u5ea6\u4e0a\uff0c\u5b83\u63a7\u5236\u4e86\u4f18\u52bf\u4f30\u8ba1\u7684\u65b9\u5dee\u548c\u504f\u5dee\uff09<\/strong><\/p>\n\n\n\n<p><strong>\u770b\u5230\u8fd9\u91cc\uff0c\u4f60\u53ef\u80fd\u60f3\u95ee\uff1a\u8fd9\u4e2a\u4ee3\u8868\u672a\u6765\u4f18\u52bf\u7684<\/strong>&nbsp;Advt+1&nbsp;<strong>\uff0c\u6211\u8981\u600e\u4e48\u7b97\u5462\uff1f<\/strong><br>\u6ce8\u610f\u5230\uff0c\u5bf9\u4e8e\u6700\u540e\u4e00\u4e2a\u65f6\u523b&nbsp;T&nbsp;\uff0c\u5b83\u7684\u672a\u6765\u6536\u76ca\uff08&nbsp;VT+1&nbsp;\uff09\u548c\u672a\u6765\u4f18\u52bf\uff08&nbsp;AdvT+1&nbsp;\uff09\u90fd\u662f0\uff0c\u4e5f\u5c31\u662f&nbsp;<em>Adv<sub>T<\/sub>=R<sub>T<\/sub>\u2212V<sub>T<\/sub><\/em>&nbsp;\uff0c\u8fd9\u662f\u53ef\u4ee5\u76f4\u63a5\u7b97\u51fa\u6765\u7684\u3002<strong>\u800c\u6709\u4e86<\/strong>&nbsp;<em>Adv<sub>T<\/sub><\/em>&nbsp;<strong>\uff0c\u6211\u4eec\u4e0d\u5c31\u80fd\u4ece\u540e\u5f80\u524d\uff0c\u901a\u8fc7\u52a8\u6001\u89c4\u5212\u7684\u65b9\u6cd5\uff0c\u628a\u6240\u6709\u65f6\u523b\u7684\u4f18\u52bf\u90fd\u4f9d\u6b21\u7b97\u51fa\u6765\u4e86\u5417\uff1f<\/strong><\/p>\n\n\n\n<p>\u4ee3\u7801\u5b9e\u8df5\u5982\u4e0b\uff08\u5176\u4e2d\u8fd4\u56de\u503c\u4e2d\u7684returns\u8868\u793a\u5b9e\u9645\u6536\u76ca\uff0c\u5c06\u88ab\u7528\u4e8e\u8ba1\u7b97Critic\u6a21\u578b\u7684loss\
uff0c\u53ef\u4ee5\u53c2\u89c14.2\uff0c\u5176\u4f59\u7ec6\u8282\u90fd\u5728\u4ee3\u7801\u6ce8\u91ca\u4e2d\uff09\uff1a\u6ce8\u610f\u8fd9\u4e2a\u51fd\u6570\u4e00\u5e76\u8fd4\u56de\u4e86&nbsp;<code>returns<\/code>\uff0c\u4e5f\u5373\u6bcf\u4e2a token \u7684\u5b9e\u9645\u6536\u76ca\uff0c\u8fd9\u4e2a\u6536\u76ca\u4e4b\u540e\u4f1a\u7528\u4e8e\u66f4\u65b0 critic model\uff1a<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code> def get_advantages_and_returns(self, values, rewards, start):\n        \"\"\"\n        Adopted from https:\/\/github.com\/CarperAI\/trlx\/blob\/main\/trlx\/models\/modeling_ppo.py#L134\n        \n        \u6ca1\u6709\u5f15\u5165GAE\u524d\u7684t\u65f6\u523b\u7684\u4f18\u52bf\u503c\uff1a\n        detal_t = r_t + gamma * V_t+1 - V_t\n        \u5176\u4e2d\uff1a\n            - r_t\u8868\u793at\u65f6\u523b\u7684\u5373\u65f6\u6536\u76ca\n            - V_t+1\u8868\u793a\u672a\u6765\u65f6\u523b\u7684\u9884\u671f\u6536\u76ca\n            - r_t + gamma * V_t+1\u53ef\u7406\u89e3\u6210t\u65f6\u523b\u7684\u5b9e\u9645\u9884\u671f\u6536\u76ca\n            - V_t\u53ef\u7406\u89e3\u6210t\u65f6\u523b\u7684\u9884\u4f30\u9884\u671f\u6536\u76ca\uff08\u662f\u6a21\u578b\uff0c\u4f8b\u5982critic model\u81ea\u5df1\u4f30\u7b97\u51fa\u6765\u7684\uff09\n        \n        \u5f15\u5165GAE\u540e\u7684t\u65f6\u523b\u7684\u4f18\u52bf\u503c\uff1a\n        A_t = delta_t + gamma * lambda * A_t+1\n        \u7c97\u66b4\u7406\u89e3\u4e3a\u5728t\u65f6\u523b\u65f6\uff0c\u4e0d\u4ec5\u8003\u8651\u5f53\u4e0b\u4f18\u52bf\uff0c\u8fd8\u8003\u8651\u4e86\u672a\u6765\u7684\u4f18\u52bf\n        \u4e3a\u4e86\u77e5\u9053A_t, \u6211\u4eec\u5f97\u77e5\u9053A_t+1\uff0c\u6240\u4ee5\u5728\u672c\u7b97\u6cd5\u4e2d\u91c7\u53d6\u4e86\u4ece\u540e\u5f80\u524d\u505a\u52a8\u6001\u89c4\u5212\u6c42\u89e3\u7684\u65b9\u6cd5\uff0c\u4e5f\u5373\uff1a\n        \u5047\u8bbeT\u662f\u6700\u540e\u4e00\u4e2a\u65f6\u523b\uff0c\u5219\u6709A_T+1 = 0, \u6240\u4ee5\u6709: A_T = delta_T\n        \u77e5\u9053\u4e86A_T, 
\u5c31\u53ef\u4ee5\u4f9d\u6b21\u5f80\u524d\u5012\u63a8\uff0c\u628aA_t-1, A_t-2\u4e4b\u7c7b\u90fd\u7b97\u51fa\u6765\u4e86\n        \n        \u5f15\u5165GAE\u540et\u65f6\u523b\u7684\u5b9e\u9645\u9884\u671f\u6536\u76ca\n        returns_t = A_t + V_t\n                  = delta_t + gamma * lambda * A_t+1 + V_t\n                  = r_t + gamma * V_t+1 - V_t + gamma * lambda * A_t+1 + V_t\n                  = r_t + gamma * (V_t+1 + lambda * A_t+1)\n        \n        \u6ce8\u610f\uff0c\u8fd9\u91cc\u4e0d\u7ba1\u662fadvantages\u8fd8\u662freturns\uff0c\u90fd\u53ea\u7b97response\u7684\u90e8\u5206\n        \"\"\"\n        \n        <em># Adopted from https:\/\/github.com\/CarperAI\/trlx\/blob\/main\/trlx\/models\/modeling_ppo.py#L134<\/em>\n        lastgaelam = 0\n        advantages_reversed = &#091;]\n        length = rewards.size()&#091;-1]\n        <em># \u6ce8\u610f\u8fd9\u91cc\u7528\u4e86reversed\uff0c\u662f\u91c7\u53d6\u4ece\u540e\u5f80\u524d\u5012\u63a8\u8ba1\u7b97\u7684\u65b9\u5f0f<\/em>\n        for t in reversed(range(start, length)):\n            nextvalues = values&#091;:, t + 1] if t &lt; length - 1 else 0.0\n            delta = rewards&#091;:, t] + self.gamma * nextvalues - values&#091;:, t]\n            lastgaelam = delta + self.gamma * self.lam * lastgaelam\n            advantages_reversed.append(lastgaelam)\n        advantages = torch.stack(advantages_reversed&#091;::-1], dim=1) <em># \u4f18\u52bf<\/em>\n        returns = advantages + values&#091;:, start:] <em># \u5b9e\u9645\u6536\u76ca<\/em>\n        <em># values: \u9884\u671f\u6536\u76ca<\/em>\n        return advantages.detach(), returns<\/code><\/pre>\n\n\n\n<p id=\"h_677607581_15\"><strong>\uff085\uff09PPO-epoch: \u5f15\u5165\u65b0\u7ea6\u675f<\/strong><\/p>\n\n\n\n<p>\u603b\u7ed3\u4e00\u4e0b\uff0c\u76ee\u524d\u4e3a\u6b62\u6211\u4eec\u7684actor_loss\u4e3a\uff1a<\/p>\n\n\n\n<div class=\"wp-block-image\"><figure class=\"aligncenter size-full is-resized\"><img loading=\"lazy\" 
src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-124.png\" alt=\"\" class=\"wp-image-25394\" width=\"443\" height=\"78\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-124.png 595w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-124-300x53.png 300w\" sizes=\"(max-width: 443px) 100vw, 443px\" \/><\/figure><\/div>\n\n\n\n<p>\u540c\u65f6<\/p>\n\n\n\n<ul><li><strong>\u6211\u4eec\u5df2\u7ecf\u5bf9<\/strong><em>Rt<\/em><strong>\u8fdb\u884c\u4e86\u6539\u9020\uff0c\u4f7f\u5176\u80fd\u591f\u8861\u91cfActor\u6a21\u578b\u662f\u5426\u9075\u4ece\u4e86Ref\u6a21\u578b\u7684\u7ea6\u675f\u3002<\/strong><\/li><li><strong>\u6211\u4eec\u5df2\u7ecf\u5bf9<\/strong><em>Advt<\/em><strong>\u8fdb\u884c\u6539\u9020\uff0c\u4f7f\u5176\u4e0d\u4ec5\u8003\u8651\u4e86\u5f53\u524d\u65f6\u523b\u7684\u4f18\u52bf\uff0c\u8fd8\u8003\u8651\u4e86\u672a\u6765\u7684\u4f18\u52bf<\/strong><\/li><\/ul>\n\n\n\n<p>\u57fa\u4e8e\u8fd9\u4e9b\u6539\u9020\uff0c\u6211\u4eec\u91cd\u65b0\u7406\u4e00\u904dRLHF-PPO\u7684\u8bad\u7ec3\u8fc7\u7a0b\u3002<\/p>\n\n\n\n<figure class=\"wp-block-image size-full\"><img loading=\"lazy\" width=\"898\" height=\"387\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-125.png\" alt=\"\" class=\"wp-image-25398\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-125.png 898w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-125-300x129.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-125-768x331.png 768w\" sizes=\"(max-width: 898px) 100vw, 898px\" 
\/><\/figure>\n\n\n\n<ul><li>\u7b2c\u4e00\u6b65\uff0c\u6211\u4eec\u51c6\u5907\u4e00\u4e2abatch\u7684prompts<\/li><li>\u7b2c\u4e8c\u6b65\uff0c\u6211\u4eec\u5c06\u8fd9\u4e2abatch\u7684prompts\u5582\u7ed9Actor\u6a21\u578b\uff0c\u8ba9\u5b83\u751f\u6210\u5bf9\u5e94\u7684responses<\/li><li>\u7b2c\u4e09\u6b65\uff0c\u6211\u4eec\u628aprompt+responses\u5582\u7ed9\u6211\u4eec\u7684Critic\/Reward\/Reference\u6a21\u578b\uff0c\u8ba9\u5b83\u751f\u6210\u7528\u4e8e\u8ba1\u7b97actor\/critic loss\u7684\u6570\u636e\uff0c\u6309\u7167\u5f3a\u5316\u5b66\u4e60\u7684\u672f\u8bed\uff0c\u6211\u4eec\u79f0\u8fd9\u4e9b\u6570\u636e\u4e3a\u7ecf\u9a8c\uff08experiences\uff09\u3002critic loss\u6211\u4eec\u5c06\u5728\u540e\u6587\u505a\u8be6\u7ec6\u8bb2\u89e3\uff0c\u76ee\u524d\u6211\u4eec\u53ea\u628a\u76ee\u5149\u805a\u7126\u5230actor loss\u4e0a<\/li><li>\u7b2c\u56db\u6b65\uff0c\u6211\u4eec\u6839\u636e\u8fd9\u4e9b\u7ecf\u9a8c\uff0c\u5b9e\u9645\u8ba1\u7b97\u51faactor\/critic loss\uff0c\u7136\u540e\u66f4\u65b0Actor\u548cCritic\u6a21\u578b<\/li><\/ul>\n\n\n\n<p>\u8fd9\u4e9b\u6b65\u9aa4\u90fd\u5f88\u7b26\u5408\u76f4\u89c9\uff0c\u4f46\u662f\u7ec6\u5fc3\u7684\u4f60\u80af\u5b9a\u53d1\u73b0\u4e86\uff0c<strong>\u6587\u5b57\u63cf\u8ff0\u4e2d\u7684\u7b2c\u56db\u6b65\u548c\u56fe\u4f8b\u4e2d\u7684\u7b2c\u56db\u6b65\u6709\u5dee\u5f02\uff1a\u56fe\u4e2d\u8bf4\uff0c\u8fd9\u4e00\u4e2abatch\u7684\u7ecf\u9a8c\u503c\u5c06\u88ab\u7528\u4e8en\u6b21\u6a21\u578b\u66f4\u65b0\uff0c\u8fd9\u662f\u4ec0\u4e48\u610f\u601d\u5462\uff1f<\/strong><br><br>\u6211\u4eec\u77e5\u9053\uff0c<strong>\u5728\u5f3a\u5316\u5b66\u4e60\u4e2d\uff0c\u6536\u96c6\u4e00\u4e2abatch\u7684\u7ecf\u9a8c\u662f\u975e\u5e38\u8017\u65f6\u7684\u3002\u5bf9\u5e94\u5230\u6211\u4eecRLHF\u7684\u4f8b\u5b50\u4e2d\uff0c\u6536\u96c6\u4e00\u6b21\u7ecf\u9a8c\uff0c\u5b83\u8981\u7b49\u56db\u4e2a\u6a21\u578b\u505a\u5b8c\u63a8\u7406\u624d\u53ef\u4ee5<\/strong>\uff0c\u6b63\u662f\u56e0\u6b64\uff0c<strong>\u4e00\u4e2abatch\u7684\u7ecf\u9a8c\uff0c\u53ea\u7528\u4e8e\u8ba1\u7b971\u6b21l
oss\uff0c\u66f4\u65b01\u6b21Actor\u548cCritic\u6a21\u578b\uff0c\u597d\u50cf\u6709\u70b9\u592a\u6d6a\u8d39\u4e86<\/strong>\u3002<br><br>\u6240\u4ee5\uff0c<strong>\u6211\u4eec\u81ea\u7136\u800c\u7136\u60f3\u5230\uff0c1\u4e2abatch\u7684\u7ecf\u9a8c\uff0c\u80fd\u4e0d\u80fd\u7528\u6765\u8ba1\u7b97ppo-epochs\u6b21loss\uff0c\u66f4\u65b0ppo-epochs\u6b21Actor\u548cCritic\u6a21\u578b\uff1f<\/strong>\u7b80\u5355\u5199\u4e00\u4e0b\u4f2a\u4ee3\u7801\uff0c\u6211\u4eec\u60f3\u8981\uff1a<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code><em># --------------------------------------------------------------<\/em>\n<em># \u521d\u59cb\u5316RLHF\u4e2d\u7684\u56db\u4e2a\u6a21\u578b<\/em>\n<em># --------------------------------------------------------------<\/em>\nactor, critic, reward, ref = initialize_models()\n\n<em># --------------------------------------------------------------<\/em>\n<em># \u8bad\u7ec3<\/em>\n<em># --------------------------------------------------------------<\/em>\n<em># \u5bf9\u4e8e\u6bcf\u4e00\u4e2abatch\u7684\u6570\u636e<\/em>\nfor i in steps: \n    <em># \u5148\u6536\u96c6\u7ecf\u9a8c\u503c<\/em>\n    exps = generate_experience(prompts, actor, critic, reward, ref)\n    <em># \u4e00\u4e2abatch\u7684\u7ecf\u9a8c\u503c\u5c06\u88ab\u7528\u4e8e\u8ba1\u7b97ppo_epochs\u6b21loss\uff0c\u66f4\u65b0ppo_epochs\u6b21\u6a21\u578b<\/em>\n    <em># \u8fd9\u4e5f\u610f\u5473\u7740\uff0c\u5f53\u4f60\u8ba1\u7b97\u4e00\u6b21\u65b0loss\u65f6\uff0c\u4f60\u7528\u7684\u662f\u66f4\u65b0\u540e\u7684\u6a21\u578b<\/em>\n    for j in ppo_epochs:\n        actor_loss = cal_actor_loss(exps, actor)\n        critic_loss = cal_critic_loss(exps, critic)\n        \n        actor.backward(actor_loss)\n        actor.step()\n        \n        critc.backward(critic_loss)\n        
critic.step()<\/code><\/pre>\n\n\n\n<p><strong>\u800c\u5982\u679c\u6211\u4eec\u60f3\u8ba9\u4e00\u4e2abatch\u7684\u7ecf\u9a8c\u503c\u88ab\u91cd\u590d\u4f7f\u7528ppo_epochs\u6b21\uff0c\u7b49\u4ef7\u4e8e\u6211\u4eec\u60f3\u8981Actor\u5728\u8fd9\u4e2a\u8fc7\u7a0b\u4e2d\uff0c\u6a21\u62df\u548c\u73af\u5883\u4ea4\u4e92ppo_epochs\u6b21\u3002<\/strong>\u4e3e\u4e2a\u4f8b\u5b50\uff1a<\/p>\n\n\n\n<ul><li>\u5982\u679c1\u4e2abatch\u7684\u7ecf\u9a8c\u503c\u53ea\u4f7f\u75281\u6b21\uff0c\u90a3\u4e48\u5728\u672c\u6b21\u66f4\u65b0\u5b8c\u540e\uff0cActor\u5c31\u5403\u65b0\u7684batch\uff0c\u6b63\u5e38\u548c\u73af\u5883\u4ea4\u4e92\uff0c\u4ea7\u51fa\u65b0\u7684\u7ecf\u9a8c\u503c<\/li><li>\u4f46\u5982\u679c1\u4e2abatch\u7684\u7ecf\u9a8c\u503c\u88ab\u4f7f\u7528ppo_epochs\u6b21\uff0c\u5728\u8fd9ppo_epochs\u4e2d\uff0cActor\u662f\u4e0d\u5403\u4efb\u4f55\u65b0\u6570\u636e\uff0c\u4e0d\u505a\u4efb\u4f55\u4ea4\u4e92\u7684\uff0c\u6240\u4ee5\u6211\u4eec\u53ea\u80fd\u8ba9Actor\u201c\u6a21\u62df\u201d\u4e00\u4e0b\u548c\u73af\u5883\u4ea4\u4e92\u7684\u8fc7\u7a0b\uff0c\u5410\u51fa\u4e00\u4e9b\u65b0\u6570\u636e\u51fa\u6765\u3002<\/li><\/ul>\n\n\n\n<p>\u90a3\u600e\u4e48\u8ba9Actor\u6a21\u62df\u5462\uff1f\u5f88\u7b80\u5355\uff0c\u8ba9\u5b83\u89c2\u5bdf\u4e00\u4e0b\u4e4b\u524d\u7684\u6570\u636e\u957f\u4ec0\u4e48\u6837\uff0c\u8ba9\u5b83\u4f9d\u846b\u82a6\u753b\u74e2\uff0c\u4e0d\u5c31\u884c\u4e86\u5417\uff1f<strong>\u6211\u4eec\u5047\u8bbe\u6700\u5f00\u59cb\u5403batch\uff0c\u5410\u51fa\u7ecf\u9a8c\u7684actor\u53eb<\/strong><em>&nbsp;Actor<sub>old<\/sub><\/em>&nbsp;<strong>\uff0c\u800c\u5728\u4f2a\u4ee3\u7801\u4e2d\uff0c\u6bcf\u6b21\u505a\u5b8cppo_epochs\u800c\u66f4\u65b0\u7684actor\u53eb<\/strong>&nbsp;<em>Actor<sub>new<\/sub><\/em>&nbsp;<strong>\uff0c\u90a3\u4e48\u6211\u4eec\u53ea\u8981\u5c3d\u91cf\u4fdd\u8bc1\u6bcf\u6b21\u66f4\u65b0\u540e\u7684<\/strong>&nbsp;<em>Actor<sub>new<\/sub><\/em>&nbsp;<strong>\u80fd\u6a21\u4eff\u6700\u5f00\u59cb\u7684\u90a3\u4e2a<\/strong>&nbsp;<em>Actor<sub>old<\/sub><\/em>&nbsp;
<strong>\uff0c\u4e0d\u5c31\u884c\u4e86\u5417\uff1f<\/strong><\/p>\n\n\n\n<p>\u8bf6\uff01\u662f\u4e0d\u662f\u5f88\u773c\u719f\uff01\u4e24\u4e2a\u5206\u5e03\uff0c\u901a\u8fc7\u4ec0\u4e48\u65b9\u6cd5\u8ba9\u5b83\u4eec\u76f8\u8fd1\uff01<strong>\u90a3\u5f53\u7136\u662fKL\u6563\u5ea6<\/strong>\uff01\u6240\u4ee5\uff0c\u518d\u56de\u5230\u6211\u4eec\u7684actor_loss\u4e0a\u6765\uff0c\u5b83\u73b0\u5728\u5c31\u53ef\u88ab\u6539\u8fdb\u6210\uff1a<\/p>\n\n\n\n<figure class=\"wp-block-image size-full is-resized\"><img loading=\"lazy\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-126.png\" alt=\"\" class=\"wp-image-25401\" width=\"253\" height=\"34\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-126.png 385w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-126-300x40.png 300w\" sizes=\"(max-width: 253px) 100vw, 253px\" \/><\/figure>\n\n\n\n<p>\u6211\u4eec\u518d\u7a0d\u4f5c\u4e00\u4e9b\u6539\u52a8\u5c06log\u53bb\u6389\uff08\u8fd9\u4e2a\u5176\u5b9e\u4e0d\u662f\u201c\u7a0d\u4f5c\u6539\u52a8\u53bb\u6389log\u201d\u7684\u4e8b\uff0c\u662f\u6d89\u53ca\u5230PPO\u4e2d\u91cd\u8981\u6027\u91c7\u6837\u7684\u76f8\u5173\u5185\u5bb9\uff0c\u5927\u5bb6\u6709\u5174\u8da3\u53ef\u4ee5\u53c2\u8003<a rel=\"noreferrer noopener\" href=\"https:\/\/link.zhihu.com\/?target=https%3A\/\/www.cnblogs.com\/xingzheai\/p\/15931681.html\" target=\"_blank\">\u8fd9\u7bc7<\/a>\uff09\uff1a<\/p>\n\n\n\n<figure class=\"wp-block-image size-full is-resized\"><img loading=\"lazy\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-127.png\" alt=\"\" class=\"wp-image-25402\" width=\"280\" height=\"45\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-127.png 376w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-127-300x49.png 300w\" sizes=\"(max-width: 280px) 100vw, 280px\" 
\/><\/figure>\n\n\n\n<p>\u5176\u4e2d\uff0c<strong>&nbsp;Pold&nbsp;\u8868\u793a\u771f\u6b63\u5403\u4e86batch\uff0c\u4ea7\u51fa\u7ecf\u9a8c\u503c\u7684Actor<\/strong>\uff1bP\u8868\u793appo_epochs\u4e2d\u5b9e\u65f6\u8fed\u4ee3\u66f4\u65b0\u7684Actor\uff0c\u5b83\u5728\u6a21\u4eff&nbsp;Pold&nbsp;\u7684\u884c\u4e3a\u3002<strong>\u6240\u4ee5\u8fd9\u4e2a\u516c\u5f0f\u4ece\u76f4\u89c9\u4e0a\u4e5f\u53ef\u4ee5\u7406\u89e3\u6210\uff1a\u5728Actor\u60f3\u901a\u8fc7\u6a21\u62df\u4ea4\u4e92\u7684\u65b9\u5f0f\uff0c\u4f7f\u7528\u4e00\u4e2abatch\u7684\u7ecf\u9a8c\u503c\u66f4\u65b0\u81ea\u5df1\u65f6\uff0c\u5b83\u9700\u8981\u53d7\u5230\u771f\u6b63\u5403\u5230batch\u7684\u90a3\u4e2a\u65f6\u523b\u7684Actor\u7684\u7ea6\u675f\uff0c\u8fd9\u6837\u624d\u80fd\u5728\u6709\u6548\u5229\u7528batch\uff0c\u63d0\u5347\u8bad\u7ec3\u901f\u5ea6\u7684\u57fa\u7840\u4e0a\uff0c\u4fdd\u6301\u8bad\u7ec3\u7684\u7a33\u5b9a\u3002<\/strong><\/p>\n\n\n\n<p>\u5728 PPO \u5f3a\u5316\u5b66\u4e60\u4e2d\u4f7f\u7528 KL \u6563\u5ea6\uff0c\u662f\u4e3a\u4e86\uff1a<\/p>\n\n\n\n<blockquote class=\"wp-block-quote\"><p><strong>\u8ba9\u5f53\u524d\u7b56\u7565\uff08Actor\uff09\u5728\u66f4\u65b0\u65f6\u4e0d\u8981\u504f\u79bb\u65e7\u7b56\u7565\u592a\u8fdc\uff0c\u4ece\u800c\u4fdd\u8bc1\u7ecf\u9a8c\u6570\u636e\u4f9d\u7136\u6709\u6548\u3001\u8bad\u7ec3\u8fc7\u7a0b\u66f4\u7a33\u5b9a\u3002<\/strong><\/p><\/blockquote>\n\n\n\n<p>\u6362\u53e5\u8bdd\u8bf4\uff1a<\/p>\n\n\n\n<ul><li><code>P_old<\/code> \u662f\u201c\u771f\u6b63\u7ecf\u5386\u8fc7\u73af\u5883\u201d\u7684<\/li><li><code>P<\/code> \u662f\u201c\u540e\u7eed\u66f4\u65b0\u540e\u6a21\u62df\u4ea4\u4e92\u7684\u201d<\/li><li>\u6240\u4ee5\u4f60\u5f97\u8ba9 <code>P<\/code> \u5c3d\u91cf\u6a21\u4eff <code>P_old<\/code>\uff0c\u624d\u80fd\u7ee7\u7eed\u7528\u65e7\u6570\u636e\u53bb\u66f4\u65b0\u6a21\u578b<\/li><li>KL 
\u6563\u5ea6\uff0c\u5c31\u662f\u8fd9\u4e2a\u201c\u6a21\u4eff\u7a0b\u5ea6\u201d\u7684\u8861\u91cf\u6307\u6807\u548c\u7ea6\u675f\u624b\u6bb5<\/li><\/ul>\n\n\n\n<p>\u4f46\u662f\uff0c\u8c28\u614e\u7684\u4f60\u53ef\u80fd\u6b64\u65f6\u53c8\u6709\u65b0\u7684\u62c5\u5fc3\u4e86\uff1a<strong>\u867d\u7136\u6211\u4eec\u5728\u66f4\u65b0Actor\u7684\u8fc7\u7a0b\u4e2d\u7528<\/strong>&nbsp;Actorold&nbsp;<strong>\u505a\u4e86\u7ea6\u675f\uff0c\u4f46\u5982\u679c<\/strong>&nbsp;Actorold&nbsp;<strong>\u7684\u7ea6\u675f\u80fd\u529b\u4e0d\u591f\uff0c\u6bd4\u5982\u8bf4<\/strong>&nbsp;P(At|St)\/Pold(At|St)&nbsp;<strong>\u8fd8\u662f\u8d85\u51fa\u4e86\u53ef\u63a5\u53d7\u7684\u8303\u56f4\uff0c\u90a3\u600e\u4e48\u529e\uff1f<\/strong><\/p>\n\n\n\n<p>\u5f88\u7b80\u5355\uff0c\u90a3\u5c31<strong>\u526a\u88c1\uff08clip\uff09<\/strong>\u5b83\u5427\uff01<\/p>\n\n\n\n<p>\u6211\u4eec\u7ed9&nbsp;P(At|St)\/Pold(At|St)&nbsp;\u8bbe\u7f6e\u4e00\u4e2a\u8303\u56f4\uff0c\u4f8b\u5982<code>(0.8 ,1.2)<\/code>\uff0c\u4e5f\u5c31\u662f\u5982\u679c\u8fd9\u4e2a\u503c\u4e00\u65e6\u8d85\u8fc71.2\uff0c\u90a3\u5c31\u7edf\u4e00\u53d8\u62101.2\uff1b\u4e00\u65e6\u5c0f\u4e8e0.8\uff0c\u90a3\u5c31\u7edf\u4e00\u53d8\u62100.8\u3002\u8fd9\u6837\u5c31\u80fd\u4fdd\u8bc1&nbsp;Actor&nbsp;\u548c&nbsp;Actorold&nbsp;\u7684\u5206\u5e03\u76f8\u4f3c\u6027\u5728\u6211\u4eec\u7684\u638c\u63a7\u4e4b\u5185\u4e86\u3002\u6b64\u65f6actor_loss\u53d8\u4e3a\uff1a<\/p>\n\n\n\n<div class=\"wp-block-image\"><figure class=\"aligncenter size-full is-resized\"><img loading=\"lazy\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-128.png\" alt=\"\" class=\"wp-image-25406\" width=\"533\" height=\"50\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-128.png 801w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-128-300x28.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-128-768x73.png 768w\" sizes=\"(max-width: 533px) 100vw, 533px\" 
\/><\/figure><\/div>\n\n\n\n<p>\u8fd9\u65f6\u8981\u6ce8\u610f\uff0c\u5982\u679c\u8d85\u8fc7\u53d8\u5316\u8303\u56f4\uff0c\u5c06&nbsp;P(At|St)\/Pold(At|St)&nbsp;\u5f3a\u5236\u8bbe\u5b9a\u4e3a\u4e00\u4e2a\u5e38\u6570\u540e\uff0c\u5c31\u8bf4\u660e\u8fd9\u4e00\u90e8\u5206\u7684loss\u548cActor\u6a21\u578b\u65e0\u5173\u4e86\uff0c\u800c&nbsp;Advt&nbsp;\u8fd9\u9879\u672c\u8eab\u4e5f\u4e0eActor\u65e0\u5173\u3002<strong>\u6240\u4ee5\u76f8\u5f53\u4e8e\uff0c\u5728\u8d85\u8fc7\u7ea6\u675f\u8303\u56f4\u65f6\uff0c\u6211\u4eec\u505c\u6b62\u5bf9Actor\u6a21\u578b\u8fdb\u884c\u66f4\u65b0\u3002<\/strong><\/p>\n\n\n\n<p>\u6574\u4f53\u4ee3\u7801\u5982\u4e0b\uff1a<\/p>\n\n\n\n<p>\u6ce8\u610f\u6700\u540e\u6574\u4e2a response \u6bcf\u4e00\u5904\u7684 loss \u53d6\u5747\u503c\uff0c\u5c31\u662f\u8fd9\u4e2a prompt + response \u7684 actor loss \u4e86<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>    def actor_loss_fn(self, logprobs, old_logprobs, advantages, mask):\n        \"\"\"\n        logprobs: \u5b9e\u65f6\u8ba1\u7b97\u7684\uff0cresponse\u90e8\u5206\u7684prob\uff08\u53ea\u6709\u8fd9\u4e2a\u662f\u968f\u7740actor\u5b9e\u65f6\u66f4\u65b0\u800c\u6539\u53d8\u7684\uff09\n        old_logprobs\uff1a\u8001\u7b56\u7565\u4e2d\uff0cresponse\u90e8\u5206\u7684prob \uff08\u8fd9\u4e2a\u662f\u56fa\u5b9a\u7684\uff0c\u4e0d\u968factor\u5b9e\u65f6\u66f4\u65b0\u800c\u6539\u53d8\uff09\n        advantages\uff1a \u8001\u7b56\u7565\u4e2d\uff0cresponse\u90e8\u5206\u6bcf\u4e2atoken\u5bf9\u5e94\u7684\u4f18\u52bf\uff08\u8fd9\u4e2a\u662f\u56fa\u5b9a\u7684\uff0c\u4e0d\u968factor\u5b9e\u65f6\u66f4\u65b0\u800c\u6539\u53d8\uff09\n        mask\uff1a\u8001\u7b56\u7565\u4e2d\uff0cresponse\u90e8\u5206\u5bf9\u5e94\u7684mask\u60c5\u51b5\u8fd9\u4e2a\u662f\u56fa\u5b9a\u7684\uff0c\u4e0d\u968factor\u5b9e\u65f6\u66f4\u65b0\u800c\u6539\u53d8\uff09\n        \n        
\u4e4b\u6240\u4ee5\u8981\u5f15\u5165logprobs\u8ba1\u7b97actor_loss\uff0c\u662f\u56e0\u4e3a\u6211\u4eec\u4e0d\u5e0c\u671b\u7b56\u7565\u6bcf\u6b21\u66f4\u65b0\u7684\u5e45\u5ea6\u592a\u5927\uff0c\u9632\u6b62\u6a21\u578b\u8bad\u6b6a\n        \n        self.cliprange: \u9ed8\u8ba4\u503c\u662f0.2\n        \"\"\"\n        <em>## policy gradient loss<\/em>\n        <em># -------------------------------------------------------------------------------------<\/em>\n        <em># \u8ba1\u7b97\u65b0\u65e7\u7b56\u7565\u95f4\u7684KL\u6563\u5ea6<\/em>\n        <em># -------------------------------------------------------------------------------------<\/em>\n        log_ratio = (logprobs - old_logprobs) * mask\n        ratio = torch.exp(log_ratio)\n        <em># -------------------------------------------------------------------------------------<\/em>\n        <em># \u8ba1\u7b97\u539f\u59cbloss\u548c\u622a\u65adloss<\/em>\n        <em># -------------------------------------------------------------------------------------<\/em>\n        pg_loss1 = -advantages * ratio\n        pg_loss2 = -advantages * torch.clamp(ratio, 1.0 - self.cliprange, 1.0 + self.cliprange)\n        pg_loss = torch.sum(torch.max(pg_loss1, pg_loss2) * mask) \/ mask.sum() <em># \u6700\u540e\u662f\u53d6\u6bcf\u4e2a\u975emask\u7684response token\u7684\u5e73\u5747loss\u4f5c\u4e3a\u6700\u7ec8loss<\/em>\n        return pg_loss\n\n<\/code><\/pre>\n\n\n\n<p id=\"h_677607581_16\"><strong>\uff086\uff09Actor loss\u5c0f\u7ed3<\/strong><\/p>\n\n\n\n<p>\uff081\uff09\uff5e\uff085\uff09\u4e2d\u6211\u4eec\u4e00\u6b65\u6b65\u6811\u7acb\u4e86actor_loss\u7684\u6539\u8fdb\u8fc7\u7a0b\uff0c\u8fd9\u91cc\u6211\u4eec\u5c31\u505a\u4e00\u4e2a\u603b\u7ed3\u5427\uff1a<\/p>\n\n\n\n<figure class=\"wp-block-image size-full is-resized\"><img loading=\"lazy\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-129.png\" alt=\"\" class=\"wp-image-25411\" width=\"496\" height=\"112\" 
srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-129.png 796w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-129-300x68.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-129-768x175.png 768w\" sizes=\"(max-width: 496px) 100vw, 496px\" \/><\/figure>\n\n\n\n<ul><li><strong>\u6211\u4eec\u5df2\u7ecf\u5bf9<\/strong>Rt<strong>\u8fdb\u884c\u4e86\u6539\u9020\uff0c\u4f7f\u5176\u80fd\u591f\u8861\u91cfActor\u6a21\u578b\u662f\u5426\u9075\u4ece\u4e86Ref\u6a21\u578b\u7684\u7ea6\u675f<\/strong><\/li><li><strong>\u6211\u4eec\u5df2\u7ecf\u5bf9<\/strong>Advt<strong>\u8fdb\u884c\u6539\u9020\uff0c\u4f7f\u5176\u4e0d\u4ec5\u8003\u8651\u4e86\u5f53\u524d\u65f6\u523b\u7684\u4f18\u52bf\uff0c\u8fd8\u8003\u8651\u4e86\u672a\u6765\u7684\u4f18\u52bf<\/strong><\/li><li><strong>\u6211\u4eec\u91cd\u590d\u5229\u7528\u4e861\u4e2abatch\u7684\u6570\u636e\uff0c\u4f7f\u672c\u6765\u53ea\u80fd\u88ab\u7528\u6765\u505a1\u6b21\u6a21\u578b\u66f4\u65b0\u7684\u5b83\u73b0\u5728\u80fd\u88ab\u7528\u6765\u505appo_epochs\u6b21\u6a21\u578b\u66f4\u65b0\u3002\u6211\u4eec\u4f7f\u7528\u771f\u6b63\u5403\u4e86batch\uff0c\u4ea7\u51fa\u7ecf\u9a8c\u503c\u7684\u90a3\u4e2a\u65f6\u523b\u7684Actor\u5206\u5e03\u6765\u7ea6\u675fppo_epochs\u4e2d\u66f4\u65b0\u7684Actor\u5206\u5e03<\/strong><\/li><li><strong>\u6211\u4eec\u8003\u8651\u4e86\u526a\u88c1\u673a\u5236\uff08clip\uff09\uff0c\u5728ppo_epochs\u6b21\u66f4\u65b0\u4e2d\uff0c\u4e00\u65e6Actor\u7684\u66f4\u65b0\u5e45\u5ea6\u8d85\u8fc7\u6211\u4eec\u7684\u63a7\u5236\u8303\u56f4\uff0c\u5219\u4e0d\u5bf9\u5b83\u8fdb\u884c\u53c2\u6570\u66f4\u65b0\u3002<\/strong><\/li><\/ul>\n\n\n\n<h4>Critic loss<\/h4>\n\n\n\n<p>\u6211\u4eec\u77e5\u9053\uff0c1\u4e2abatch\u4ea7\u51fa\u7684\u7ecf\u9a8c\u503c\uff0c\u4e0d\u4ec5\u88ab\u7528\u6765\u66f4\u65b0Actor\uff0c\u8fd8\u88ab\u7528\u6765\u66f4\u65b0Critic\u3002\u5bf9\u4e8eCritic loss\uff0c\u6211\u4eec\u4e0d\u518d\u50cfActor 
loss\u4e00\u6837\u7ed9\u51fa\u4e00\u4e2a\u201c\u6f14\u53d8\u8fc7\u7a0b\u201d\u7684\u89e3\u8bfb\uff0c\u6211\u4eec\u76f4\u63a5\u6765\u770b\u5b83\u6700\u540e\u7684\u8bbe\u8ba1\u3002<br><br>\u9996\u5148\uff0c\u5728\u4e4b\u524d\u7684\u89e3\u8bf4\u4e2d\uff0c\u4f60\u53ef\u80fd\u6709\u8fd9\u6837\u4e00\u4e2a\u5370\u8c61\uff1a<\/p>\n\n\n\n<ul><li>Vt&nbsp;\uff1aCritic\u5bf9t\u65f6\u523b\u7684\u603b\u6536\u76ca\u7684\u9884\u4f30\uff0c\u8fd9\u4e2a\u603b\u6536\u76ca\u5305\u542b\u5373\u65f6\u548c\u672a\u6765\u7684\u6982\u5ff5\uff08\u9884\u4f30\u6536\u76ca\uff09<\/li><li>Rt+\u03b3\u2217Vt+1&nbsp;\uff1aReward\u8ba1\u7b97\u51fa\u7684\u5373\u65f6\u6536\u76ca&nbsp;Rt&nbsp;\uff0cCritic\u9884\u6d4b\u51fa\u7684&nbsp;t+1&nbsp;\u53ca\u4e4b\u540e\u65f6\u5019\u7684\u6536\u76ca\u7684\u6298\u73b0\uff0c\u8fd9\u662f\u6bd4&nbsp;Vt&nbsp;\u66f4\u63a5\u8fd1t\u65f6\u523b\u771f\u503c\u603b\u6536\u76ca\u7684\u4e00\u4e2a\u503c\uff08\u5b9e\u9645\u6536\u76ca\uff09<\/li><\/ul>\n\n\n\n<p>\u6240\u4ee5\uff0c\u6211\u4eec\u7684\u7b2c\u4e00\u60f3\u6cd5\u662f\uff1a<\/p>\n\n\n\n<div class=\"wp-block-image\"><figure class=\"alignleft size-full is-resized\"><img loading=\"lazy\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-130.png\" alt=\"\" class=\"wp-image-25413\" width=\"283\" height=\"25\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-130.png 427w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-130-300x26.png 300w\" sizes=\"(max-width: 283px) 100vw, 283px\" \/><\/figure><\/div>\n\n\n\n<p><\/p>\n\n\n\n<p>\u73b0\u5728\uff0c\u6211\u4eec\u5bf9\u201c\u5b9e\u9645\u6536\u76ca\u201d\u548c\u201c\u9884\u4f30\u6536\u76ca\u201d\u90fd\u505a\u4e00\u4e9b\u4f18\u5316\u3002<\/p>\n\n\n\n<p 
id=\"h_677607581_18\"><strong>\uff081\uff09\u5b9e\u9645\u6536\u76ca\u4f18\u5316<\/strong><\/p>\n\n\n\n<p>\u6211\u4eec\u539f\u59cb\u7684\u5b9e\u9645\u6536\u76ca\u4e3a&nbsp;Rt+\u03b3\u2217Vt+1&nbsp;\uff0c\u4f46\u662f\u5f53\u6211\u4eec\u5728actor_loss\u4e2d\u5f15\u5165\u201c\u4f18\u52bf\u201d\u7684\u6982\u5ff5\u65f6\uff0c\u201c\u4f18\u52bf\u201d\u4e2d\u523b\u753b\u4e86\u66f4\u4e3a\u4e30\u5bcc\u7684\u5b9e\u65f6\u6536\u76ca\u4fe1\u606f\uff0c\u6240\u4ee5\uff0c\u6211\u4eec\u5c06\u5b9e\u9645\u6536\u76ca\u4f18\u5316\u4e3a\uff1a&nbsp;Advt+Vt<\/p>\n\n\n\n<p id=\"h_677607581_19\"><strong>\uff082\uff09\u9884\u4f30\u6536\u76ca\u4f18\u5316<\/strong><\/p>\n\n\n\n<p>\u6211\u4eec\u539f\u59cb\u7684\u9884\u4f30\u6536\u76ca\u4e3a&nbsp;Vt&nbsp;\u3002<br>\u7c7b\u6bd4\u4e8eActor\uff0cCritic\u6a21\u578b\u5728ppo_epochs\u7684\u8fc7\u7a0b\u4e2d\u4e5f\u662f\u4e0d\u65ad\u66f4\u65b0\u7684\u3002\u6240\u4ee5\u8fd9\u4e2a&nbsp;Vt&nbsp;\u53ef\u4ee5\u7406\u89e3\u6210\u662f&nbsp;Criticold&nbsp;\uff0c\u4e5f\u5c31\u662f\u771f\u6b63\u5403\u4e86batch\uff0c\u53c2\u4e0e\u4ea7\u51fa\u7ecf\u9a8c\u7684\u90a3\u4e2a\u65f6\u5019\u7684Critic\u4ea7\u51fa\u7684\u6536\u76ca\u9884\u6d4b\u7ed3\u679c\u3002<br><br><br>\u6211\u4eec\u540c\u6837\u60f3\u7528\u65e7\u6a21\u578b\u53bb\u7ea6\u675f\u65b0\u6a21\u578b\uff0c\u4f46\u5bf9\u4e8eCritic\u6211\u4eec\u91c7\u7528\u7684\u7ea6\u675f\u7b56\u7565\u5c31\u6bd4\u8f83\u7b80\u5355\u4e86\uff0c\u6211\u4eec\u76f4\u63a5\u770b\u4ee3\u7801\uff0c\u4ece\u4e2d\u53ef\u4ee5\u770b\u51fa\uff0c\u6211\u4eec\u7528\u8001&nbsp;Vt&nbsp;\u8bbe\u8ba1\u4e86\u4e00\u4e2a\u53d8\u52a8\u8303\u56f4\uff0c\u7136\u540e\u7528\u8fd9\u4e2a\u53d8\u52a8\u8303\u56f4\u53bb\u7ea6\u675f\u65b0&nbsp;Vt<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code><em># self.cliprange_value\u662f\u4e00\u4e2a\u5e38\u91cf<\/em>\n<em># old_values: \u8001critic\u7684\u9884\u6d4b\u7ed3\u679c<\/em>\n<em># values\uff1a\u65b0critic\u7684\u9884\u6d4b\u7ed3\u679c<\/em>\nvalues_clipped = torch.clamp(\n            values,\n            old_values 
- self.cliprange_value,\n            old_values + self.cliprange_value,\n        )<\/code><\/pre>\n\n\n\n<p>\u90a3\u4e48\u6700\u7ec8\u6211\u4eec\u5c31\u53d6\u5b9e\u9645\u6536\u76ca\u548c\u9884\u4f30\u6536\u76ca\u7684MSE\u505a\u4e3aloss\u5c31\u597d\uff0c\u8fd9\u91cc\u6ce8\u610f\uff0c\u8ba1\u7b97\u5b9e\u9645\u6536\u76ca\u65f6&nbsp;Advt,Vt&nbsp;\u90fd\u662f\u8001Critic\uff08\u771f\u6b63\u5403\u4e86batch\u7684\u90a3\u4e2a\uff09\u4ea7\u51fa\u7684\u7ed3\u679c\uff0c\u800c\u9884\u4f30\u6536\u76ca\u662f\u968f\u7740ppo_epochs\u800c\u53d8\u52a8\u7684\u3002<\/p>\n\n\n\n<p><br>\u4ee3\u7801\u5982\u4e0b\uff1a<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>def critic_loss_fn(self, values, old_values, returns, mask):\n        \"\"\"\n        values: \u5b9e\u65f6critic\u8dd1\u51fa\u6765\u7684\u9884\u4f30\u9884\u671f\u6536\u76ca\uff08\u662f\u53d8\u52a8\u7684\uff0c\u968f\u7740ppo epoch\u8fed\u4ee3\u800c\u6539\u53d8\uff09\n        old_values\uff1a\u8001critic\u8dd1\u51fa\u6765\u7684\u9884\u4f30\u9884\u671f\u6536\u76ca\uff08\u662f\u56fa\u5b9a\u503c\uff09\n        returns\uff1a\u5b9e\u9645\u9884\u671f\u6536\u76ca\n        mask\uff1aresponse\u90e8\u5206\u7684mask\n        \n        self.cliprange_value = 0.2\n        \"\"\"\n        <em>## value loss<\/em>\n        <em># \u7528\u65e7\u7684value\u53bb\u7ea6\u675f\u65b0\u7684value<\/em>\n        values_clipped = torch.clamp(\n            values,\n            old_values - self.cliprange_value,\n            old_values + self.cliprange_value,\n        )\n        if self.compute_fp32_loss:\n            values = values.float()\n            values_clipped = values_clipped.float()\n        \n        <em># critic\u6a21\u578b\u7684loss\u5b9a\u4e49\u4e3a\uff08\u9884\u4f30\u9884\u671f\u6536\u76ca-\u5b9e\u9645\u9884\u671f\u6536\u76ca\uff09**2<\/em>\n        vf_loss1 = (values - returns)**2\n        vf_loss2 = (values_clipped - returns)**2\n        vf_loss = 0.5 * torch.sum(\n            torch.max(vf_loss1, vf_loss2) * mask) \/ mask.sum() <em># 
\u540c\u6837\uff0c\u6700\u540e\u4e5f\u662f\u628acritic loss\u5e73\u5747\u5230\u6bcf\u4e2atoken\u4e0a<\/em>\n        return vf_loss<\/code><\/pre>\n\n\n\n<h3 class=\"has-medium-pink-color has-text-color\"><strong><a href=\"https:\/\/github.com\/zhaochenyang20\/Awesome-ML-SYS-Tutorial\/tree\/main\/rlhf\/OpenRLHF#rlhf-%E7%9A%84%E8%AE%A1%E7%AE%97%E6%B5%81\" target=\"_blank\" rel=\"noreferrer noopener\">\u603b\u7ed3\uff1aRLHF \u7684\u8ba1\u7b97\u6d41<\/a><\/strong><\/h3>\n\n\n\n<h4>\u6784\u9020 Reward<a href=\"https:\/\/github.com\/zhaochenyang20\/Awesome-ML-SYS-Tutorial\/tree\/main\/rlhf\/OpenRLHF#%E6%9E%84%E9%80%A0-reward\"><\/a><\/h4>\n\n\n\n<p>\u7ed9\u5b9a\u4e00\u4e2a transformer \u548c\u4efb\u4f55\u4e00\u4e2a string\uff0c\u6211\u90fd\u53ef\u4ee5\u5c06\u6574\u4e2a string \u8f93\u5165\u7ed9 reward model \u505a\u4e00\u6b21 forward pass\uff0c\u5f97\u5230\u6bcf\u4e2a\u4f4d\u7f6e\u7684 token \u7684 logit\u3002\u6211\u4eec\u53d6\u51fa\u6700\u540e\u4e00\u4e2a token \u7684 logit\uff0c\u7ecf\u8fc7 logit processor \u5904\u7406\uff0c\u518d\u8fc7\u4e00\u6b21 softmax \u5e76\u53d6 log\uff0c\u5f97\u5230\u6b64\u5904\u7684 log prob\u3002\u6b64\u5916\uff0c\u6211\u4eec\u4e5f\u53ef\u4ee5\u5bf9\u6700\u540e\u4e00\u4e2a token \u7684 logit \u8fdb\u884c\u5176\u4ed6\u64cd\u4f5c\uff0c\u8b6c\u5982 pooling \u548c projection \u7b49\u7b49\uff0c\u62ff\u5230 embedding\u3001reward \u6216\u8005 value\u3002\u7531\u6b64\u53ef\u89c1\uff0c\u5bf9\u4e8e string \u91cc\u7684\u6bcf\u4e2a token\uff0c\u6211\u4eec\u90fd\u53ef\u4ee5\u5f97\u5230\u524d\u8ff0\u6240\u6709\u8ba1\u7b97\u503c\uff0c\u4f46\u662f<strong>\u5728 RLHF \u4e2d\uff0c\u6211\u4eec\u4f1a\u7528\u5230 response \u4e2d\u6bcf\u4e2a token \u7684 log prob \u548c value\uff0c\u4f46\u662f reward \u6a21\u578b\u53ea\u4f1a\u7528\u6700\u540e\u4e00\u4e2a token \u7684 reward<\/strong>\u3002\u8fd9\u91cc\u76f4\u63a5\u7ed9\u51fa reward \u7684\u5b9e\u9645\u8ba1\u7b97\uff1a<\/p>\n\n\n\n<figure class=\"wp-block-image size-full is-resized\"><img loading=\"lazy\" 
src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-29.png\" alt=\"\" class=\"wp-image-27740\" width=\"583\" height=\"423\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-29.png 822w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-29-300x218.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-29-768x558.png 768w\" sizes=\"(max-width: 583px) 100vw, 583px\" \/><\/figure>\n\n\n\n<pre class=\"wp-block-preformatted\">\u4e3a\u4ec0\u4e48\u53ea\u6709\u6700\u540e\u4e00\u4e2a\u65f6\u523b\u7684 Rt\u88ab\u7eb3\u5165\u4e86\u8003\u91cf\u5462\uff1f\u8fd9\u662f\u56e0\u4e3a\u5728Reward\u6a21\u578b\u8bad\u7ec3\u9636\u6bb5\uff0c\u5c31\u662f\u7528\u8fd9\u4e2a\u4f4d\u7f6e\u7684 Rt\u6765\u8868\u793a\u5bf9\u5b8c\u6574\u7684prompt + response\u7684\u5956\u52b1\u9884\u6d4b\uff08\u4f46\u4e0d\u59a8\u788d\u4f60\u7406\u89e3\u6210\u662f\u6267\u884c\u5b8c AT\u7684\u5373\u65f6\u5956\u52b1\uff09\uff0c\u7136\u540e\u7528\u8fd9\u4e2a\u6307\u6807\u6765\u505a\u6a21\u578beval\u7684\uff08\u4f46\u662fReward\u8bad\u7ec3\u9636\u6bb5\u7b97loss\u65f6\uff0c\u8fd8\u662f\u8003\u8651\u4e86response\u90e8\u5206\u6240\u6709token\u8f93\u51fa\u7684reward\u503c\uff09\u3002\u6240\u4ee5\u5230\u4e86RLHF\u7684\u573a\u666f\u4e0b\uff0c\u5176\u4f59\u65f6\u523b\u7684\u5373\u65f6\u5956\u52b1\uff0c\u6211\u4eec\u5c31\u7528\u201cActor\u662f\u5426\u9075\u5faa\u4e86Ref\u7684\u7ea6\u675f\u201d\u6765\u8fdb\u884c\u8bc4\u4ef7\u3002\n\n\u9700\u8981\u6ce8\u610f\u7684\u662f\uff0cRt\u7684\u8bbe\u8ba1\u5e76\u4e0d\u53ea\u6709\u8fd9\u4e00\u79cd\u3002deepspeed\u5728\u81ea\u5df1\u7684\u4ee3\u7801\u6ce8\u91ca\u4e2d\u4e5f\u6709\u63d0\u8fc7\uff0c\u53ef\u4ee5\u5c1d\u8bd5\u628a\u6700\u540e\u4e00\u4e2a\u65f6\u523b\u7684 
RT\u66ff\u6362\u6210\u6240\u6709token\u7684\u5373\u65f6\u5956\u52b1\u7684\u5e73\u5747\u503c\u3002\u5982\u679c\u7ad9\u5728\u8fd9\u4e2a\u89d2\u5ea6\u7406\u89e3\u7684\u8bdd\uff0c\u6211\u4eec\u540c\u6837\u4e5f\u53ef\u4ee5\u5c1d\u8bd5\u5728\u6bcf\u4e00\u4e2a\u4f4d\u7f6e\u7684\u5956\u52b1\u8861\u91cf\u4e0a\u5f15\u5165 Rt<\/pre>\n\n\n\n<p>\u5bf9\u4e8e\u7b2c t \u4e2a response token\uff0c\u5f53 t \u4e3a\u6700\u540e\u4e00\u4e2a token T \u65f6\uff0c\u624d\u5c06 reward model \u8f93\u51fa\u7684\u5bf9\u6574\u4e2a response \u7684 reward \u52a0\u5230&nbsp;Rt&nbsp;\u4e0a\u3002\u6362\u8a00\u4e4b\uff0c\u5b9e\u9645\u4e0a\u4e00\u4e2a prompt + response \u53ea\u4f1a\u8ba9 reward model \u63a8\u7406\u4e00\u6b21\uff0c\u4f5c\u4e3a\u6574\u4e2a response \u7684 reward\u3002<\/p>\n\n\n\n<p>\u81f3\u4e8e\u5176\u4ed6\u90e8\u5206\uff0c$kl _ ctl$ \u662f\u4e2a\u5e38\u6570\uff0c$ \\log \\frac{P(A_t|S_t)}{P_{ref}(A_t|S_t)} $ \u662f reference model \u548c actor model \u751f\u6210&nbsp;At&nbsp;\u8fd9\u4e2a token \u7684\u6761\u4ef6\u6982\u7387\u6bd4\u503c\u53d6\u5bf9\u6570\uff0c\u4e5f\u5373\u76f4\u63a5\u5c06 actor \u7684 log prob \u548c reference \u7684 log prob \u76f8\u51cf\uff0c\u4f53\u73b0\u5230\u4ee3\u7801\u91cc\u5c31\u662f&nbsp;<code>kl_ctl * (actor_log_probs - ref_log_probs)<\/code>\uff08KL \u6563\u5ea6\uff09\uff0c\u8fd9\u6837\u5c31\u5f97\u5230\u4e86\u6bcf\u4e2a token \u7684 reward\u3002\u6ce8\u610f\u8fd9\u91cc\u7684\u5355\u590d\u6570\uff0c<code>actor_log_probs<\/code>&nbsp;\u548c&nbsp;<code>ref_log_probs<\/code>&nbsp;\u90fd\u662f\u6240\u6709 response token \u7684 log prob \u6784\u6210\u7684 list\u3002<\/p>\n\n\n\n<p>\u5f97\u5230 KL \u6563\u5ea6\u540e\uff0c\u518d\u5728\u8fd9\u4e2a&nbsp;<code>prompt + response<\/code>&nbsp;\u7684\u6700\u540e\u4e00\u4e2a token \u4e0a\u52a0\u4e0a\u6b64\u5904\u7684 reward\uff08\u79f0\u4e3a reward score\uff09\uff0c\u6574\u4e2a response \u6bcf\u4e00\u5904\u7684 reward 
\u4fbf\u6784\u9020\u5b8c\u6210\u4e86\u3002\u5f53\u7136\uff0c\u5b9e\u9645\u4e0a\u7684\u8ba1\u7b97\u8fd8\u9700\u8981\u8003\u8651 reward score \u7684 clip \u95ee\u9898\uff0c\u4e5f\u5373\u4e0d\u80fd\u8ba9 reward \u8fc7\u5927\u3002\u5728<a href=\"https:\/\/zhuanlan.zhihu.com\/p\/677607581\">\u77e5\u4e4e<\/a>\u91cc\u9762\u7ed9\u4e86\u975e\u5e38\u597d\u7684\u4f2a\u4ee3\u7801\u3002<\/p>\n\n\n\n<h4>\u6784\u9020 Advantage<\/h4>\n\n\n\n<p>Advanatage \u53ef\u4ee5\u67d0\u79cd\u7a0b\u5ea6\u7406\u89e3\u4e3a\u201c\u610f\u5916\u4e4b\u559c\u201d\uff0c\u5177\u4f53\u7684\u63cf\u8ff0\u53c2\u8003\u77e5\u4e4e\u539f\u6587\u3002\u8fd9\u91cc\u76f4\u63a5\u7ed9\u51fa Advantage \u7684\u6784\u9020\u516c\u5f0f\uff1a<\/p>\n\n\n\n<figure class=\"wp-block-image size-full\"><img loading=\"lazy\" width=\"435\" height=\"57\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-30.png\" alt=\"\" class=\"wp-image-27743\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-30.png 435w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-30-300x39.png 300w\" sizes=\"(max-width: 435px) 100vw, 435px\" \/><\/figure>\n\n\n\n<p>\u6211\u4eec\u6765\u62c6\u89e3\u4e0b\uff0c\u8003\u8651\u5230&nbsp;Rt&nbsp;\u662f\u6bcf\u4e2a token \u7684 reward\uff0c\u524d\u9762\u5df2\u7ecf\u6784\u9020\u4e86\u3002Vt \u548c&nbsp;Vt+1&nbsp;\u662f\u5f53\u524d token \u548c\u4e0b\u4e00\u4e2a token \u7684 value\uff0c\u800c\u6bcf\u4e2a token \u7684 value \u5728 value model \u7684 forward pass \u4e2d\u90fd\u53ef\u4ee5\u5f97\u5230\uff0cAdv_t \u662f\u5f53\u524d token \u7684 advantage\uff0c$\\gamma, \\lambda$ \u90fd\u662f\u5e38\u6570\u3002\u8fd9\u79cd\u9012\u5f52\u7684\u6784\u9020\u65b9\u5f0f\uff0c\u53ef\u4ee5\u7528\u5c3e\u9012\u5f52\u6765\u53cd\u63a8\u6bcf\u4e2a\u4f4d\u7f6e\u7684 advantage\u3002<\/p>\n\n\n\n<h4>\u6784\u9020 Actor Loss<a 
href=\"https:\/\/github.com\/zhaochenyang20\/Awesome-ML-SYS-Tutorial\/tree\/main\/rlhf\/OpenRLHF#%E6%9E%84%E9%80%A0-actor-loss\"><\/a><\/h4>\n\n\n\n<p>\u8fd9\u91cc\u8fd8\u662f\u76f4\u63a5\u7ed9\u51fa Actor Loss \u7684\u6784\u9020\u516c\u5f0f\uff1a<\/p>\n\n\n\n<figure class=\"wp-block-image size-full\"><img loading=\"lazy\" width=\"694\" height=\"79\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-31.png\" alt=\"\" class=\"wp-image-27745\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-31.png 694w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-31-300x34.png 300w\" sizes=\"(max-width: 694px) 100vw, 694px\" \/><\/figure>\n\n\n\n<p>\u8fd9\u4e2a\u6784\u9020\u516c\u5f0f\u770b\u7740\u590d\u6742\uff0c\u5b9e\u9645\u4e0a\u4e00\u70b9\u4e5f\u4e0d\u7b80\u5355\u3002\u6bcf\u4e2a response token \u7684&nbsp;Advt&nbsp;\u7684\u6784\u9020\u5df2\u7ecf\u5728\u524d\u6587\u7ed9\u51fa\uff0c\u800c&nbsp;P(At|St),Pold(At|St)&nbsp;\u5176\u5b9e\u90fd\u662f actor model \u7684\u6761\u4ef6\u6982\u7387\u3002\u4e4b\u6240\u4ee5\u6709\u4e2a old \u662f\u56e0\u4e3a\u6211\u4eec\u5e0c\u671b\u591a\u5229\u7528\u6bcf\u8f6e\u4ea7\u751f\u7684 experiences\uff0c\u56e0\u6b64\u4e00\u7ec4 experiences \u4f1a\u66f4\u65b0\u591a\u8f6e\u3002old \u8868\u793a\u8fd9\u4e00\u7ec4 experiences \u7528\u4e8e\u66f4\u65b0\u4e4b\u524d\u7684 actor model\uff0c\u7528\u8fd9\u4e2a old actor model \u5bf9\u8fd9\u51e0\u8f6e\u66f4\u65b0\u7684\u5927\u5c0f\u505a\u4e86\u7ea6\u675f\u3002<strong>\u6700\u540e\uff0c\u8003\u8651\u5230\u67d0\u4e00\u8f6e\u66f4\u65b0\u91cc\uff0c\u5f53\u524d actor model \u548c old actor model \u7684\u5dee\u8ddd\u5b9e\u5728\u592a\u5927\u4e86\uff0c\u4ee5\u81f3\u4e8e\u6761\u4ef6\u6982\u7387\u7684\u6bd4\u503c\u8d85\u51fa\u4e86\u4eba\u4e3a\u9884\u8bbe\u7684\u8303\u56f4\uff0c\u6b64\u65f6&nbsp;Advt&nbsp;\u7684\u7cfb\u6570\uff08ratio\uff09\u4f1a\u53d6\u4e3a\u7ea6\u675f\u8fb9\u754c\u3002\u6b64\u65f6 actor model \u7684\u53c2\u6570\u4e0d\u518d\u5f71\u54cd 
ratio\uff0c\u6362\u8a00\u4e4b actor model \u7684\u53c2\u6570\u4e0d\u518d\u5728 actor loss \u7684\u8ba1\u7b97\u56fe\u4e2d\u4e86\uff0c\u8fd9\u4e2a loss \u4e5f\u5c31\u4e0d\u4f1a\u66f4\u65b0 actor \u7684\u53c2\u6570\u4e86\u3002<\/strong>&nbsp;\u6ce8\u610f\uff0cadvantage \u7684\u6784\u9020\u662f\u7531 old actor model \u6784\u9020\u6765\u7684\uff0c\u8ba1\u7b97\u7ed3\u675f\u5c31\u56fa\u5b9a\u4e86\uff0c\u5bf9\u4e8e\u66f4\u65b0\u4e2d\u7684 actor model \u6ca1\u6709\u68af\u5ea6\uff0c\u6240\u4ee5\u6574\u4e2a actor loss \u7684\u8ba1\u7b97\u56fe\u4e2d\u53ea\u6709 ratio \u5bf9\u66f4\u65b0\u4e2d\u7684 actor model \u6709\u68af\u5ea6\u3002<\/p>\n\n\n\n<h4>\u6784\u9020 Critic Loss<\/h4>\n\n\n\n<p>\u6ce8\u610f\u5230\uff0c\u5728 advantage \u7684\u6784\u9020\u4e2d\uff0c\u6211\u4eec\u4e00\u5e76\u5f97\u5230\u4e86&nbsp;<code>returns<\/code>\uff0c\u5c06\u5176\u89c6\u4e3a\u6bcf\u4e2a token \u7684\u5b9e\u9645\u6536\u76ca\u3002<\/p>\n\n\n\n<figure class=\"wp-block-image size-full\"><img loading=\"lazy\" width=\"747\" height=\"55\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-32.png\" alt=\"\" class=\"wp-image-27749\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-32.png 747w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-32-300x22.png 300w\" sizes=\"(max-width: 747px) 100vw, 747px\" \/><\/figure>\n\n\n\n<p>\u800c\u9884\u4f30\u6536\u76ca\u5c31\u662f&nbsp;Vt\uff0c\u7136\u540e\u6211\u4eec\u6784\u9020 MSE loss \u6765\u6700\u5c0f\u5316\u9884\u4f30\u6536\u76ca\u548c\u5b9e\u9645\u6536\u76ca\u7684\u5dee\u8ddd\u3002<\/p>\n\n\n\n<div class=\"wp-block-image\"><figure class=\"aligncenter size-full\"><img loading=\"lazy\" width=\"249\" height=\"43\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-33.png\" alt=\"\" 
class=\"wp-image-27750\"\/><\/figure><\/div>\n\n\n\n<p>\u770b\u4e0a\u53bb\u4f3c\u4e4e&nbsp;<em>Ret<sub>t<\/sub>\u2212Vt<\/em>&nbsp;\u5c31\u662f<em>&nbsp;Advt<\/em>\uff0c<strong>\u4f46\u662f\u5b9e\u9645\u4f7f\u7528\u7684&nbsp;<code>values<\/code>&nbsp;\u662f\u591a\u8f6e\u66f4\u65b0\u4e2d\u7684 value model \u7684\u8f93\u51fa\uff0c\u4e5f\u5373 new value\uff0c\u800c&nbsp;<code>returns<\/code>&nbsp;\u662f\u591a\u8f6e\u66f4\u65b0\u5f00\u59cb\u65f6\u5c31\u56fa\u5b9a\u4e86\u7684\u5b9e\u9645\u6536\u76ca\uff08old returns\uff09\uff0c\u6240\u4ee5&nbsp;<em>Ret<sub>t<\/sub>\u2212Vt&nbsp;\u5e76\u4e0d\u662f&nbsp;Advt<\/em>\u3002<\/strong><a href=\"https:\/\/github.com\/zhaochenyang20\/Awesome-ML-SYS-Tutorial\/tree\/main\/rlhf\/OpenRLHF#%E6%9E%84%E9%80%A0-advantage\"><\/a><\/p>\n\n\n\n<h4>\u66f4\u65b0\u6d41\u7a0b<a href=\"https:\/\/github.com\/zhaochenyang20\/Awesome-ML-SYS-Tutorial\/tree\/main\/rlhf\/OpenRLHF#%E6%9B%B4%E6%96%B0%E6%B5%81%E7%A8%8B\"><\/a><\/h4>\n\n\n\n<ol><li>\u51c6\u5907\u4e00\u4e2a batch \u7684&nbsp;<code>prompts<\/code>\uff1b<\/li><li>\u5c06\u8fd9\u4e2a batch \u7684&nbsp;<code>prompts<\/code>&nbsp;\u8f93\u5165\u7ed9 Actor\uff0c\u89e3\u7801\u5f97\u5230&nbsp;<code>responses<\/code>\uff1b<\/li><li>\u5c06&nbsp;<code>prompt + responses<\/code>&nbsp;\u8f93\u5165\u7ed9 Critic\/Reward\/Reference\uff0c\u5206\u522b\u8ba1\u7b97\u5f97\u5230\u6240\u6709 token \u7684 values\u3001\u6700\u540e\u4e00\u4e2a token \u7684 reward \u548c\u6240\u6709 token \u7684 log probs\uff0c\u6309\u7167\u5f3a\u5316\u5b66\u4e60\u7684\u672f\u8bed\uff0c\u79f0\u8fd9\u4e9b\u6570\u636e\u4e3a\u7ecf\u9a8c\uff08experiences\uff09\u4e86\uff1b<\/li><li>\u6839\u636e experiences \u591a\u8f6e\u8ba1\u7b97 actor loss \u548c critic loss \u5e76\u66f4\u65b0 Actor \u548c Critic \u6a21\u578b\u3002<\/li><\/ol>\n\n\n\n<p>\u5bf9\u4e8e\u7b2c 4 \u6b65\uff0c\u6211\u4eec\u5f53\u7136\u53ef\u4ee5\u4e00\u8f6e experiences \u5c31\u66f4\u65b0\u4e00\u6b21 actor \u548c 
critic\uff0c\u4f46\u662f\u4e3a\u4e86\u5c3d\u53ef\u80fd\u5229\u7528\u8fd9\u4e2a batch \u7684 experiences\uff0c\u6211\u4eec\u5bf9 actor \u548c critic \u505a\u591a\u8f6e\u66f4\u65b0\u3002\u6211\u4eec\u5c06 experiences \u4e2d\u591a\u8f6e\u66f4\u65b0\u5f00\u59cb\u524d\u7684 log probs \u548c values \u79f0\u4e3a old log probs \u548c old values\uff08reward \u4e0d\u4f1a\u591a\u8f6e\u8ba1\u7b97\uff09\u3002\u5728\u6bcf\u4e00\u8f6e\u4e2d\uff0cactor \u548c critic \u4f1a\u751f\u6210 new log probs \u548c new values\uff0c\u7136\u540e\u5728 old \u7684\u57fa\u7840\u4e0a\u8ba1\u7b97 actor loss \u548c critic loss\uff0c\u7136\u540e\u66f4\u65b0\u53c2\u6570\u3002<\/p>\n\n\n\n<h4>\u6574\u4f53\u6d41\u7a0b\uff1a<\/h4>\n\n\n\n<figure class=\"wp-block-image size-large\"><img loading=\"lazy\" width=\"1024\" height=\"781\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/make-experience-1024x781.png\" alt=\"\" class=\"wp-image-27756\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/make-experience-1024x781.png 1024w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/make-experience-300x229.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/make-experience-768x586.png 768w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/make-experience-1536x1171.png 1536w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/make-experience.png 1608w\" sizes=\"(max-width: 1024px) 100vw, 1024px\" \/><\/figure>\n\n\n\n<figure class=\"wp-block-image size-large\"><img loading=\"lazy\" width=\"1024\" height=\"765\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/learning-stage-1024x765.png\" alt=\"\" class=\"wp-image-27755\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/learning-stage-1024x765.png 1024w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/learning-stage-300x224.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/learning-stage-768x573.png 768w, 
http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/learning-stage-1536x1147.png 1536w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/learning-stage.png 1618w\" sizes=\"(max-width: 1024px) 100vw, 1024px\" \/><\/figure>\n\n\n\n<h3>PPO\u4f18\u5316\u76ee\u6807<\/h3>\n\n\n\n<p>\uff081\uff09\u7b56\u7565\u68af\u5ea6\u7b97\u6cd5\uff1a\u66f4\u65b0\u5e45\u5ea6\u5927\uff0c\u4e0d\u7a33\u5b9a<\/p>\n\n\n\n<figure class=\"wp-block-image size-full is-resized\"><img loading=\"lazy\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-100.png\" alt=\"\" class=\"wp-image-25282\" width=\"317\" height=\"53\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-100.png 374w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-100-300x50.png 300w\" sizes=\"(max-width: 317px) 100vw, 317px\" \/><\/figure>\n\n\n\n<p>\uff082\uff09TRPO\uff08\u4fe1\u4efb\u533a\u57df\u7b97\u6cd5\uff09\uff1a\u52a0\u5165KL\u6563\u5ea6\u7ea6\u675f\u6761\u4ef6\uff0c\u4f46\u9700\u8ba1\u7b97\u4e8c\u9636\u5bfc\u6570\uff0c\u8ba1\u7b97\u91cf\u5927<\/p>\n\n\n\n<figure class=\"wp-block-image size-full is-resized\"><img loading=\"lazy\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-101.png\" alt=\"\" class=\"wp-image-25284\" width=\"583\" height=\"64\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-101.png 827w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-101-300x33.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-101-768x85.png 768w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-101-825x92.png 825w\" sizes=\"(max-width: 583px) 100vw, 583px\" \/><\/figure>\n\n\n\n<p>\uff083\uff09PPO\uff08\u8fd1\u7aef\u7b56\u7565\u4f18\u5316\u7b97\u6cd5\uff09\uff1a<\/p>\n\n\n\n<figure class=\"wp-block-image size-full is-resized\"><img loading=\"lazy\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-102.png\" alt=\"\" class=\"wp-image-25285\" width=\"531\" height=\"59\" 
srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-102.png 603w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-102-300x33.png 300w\" sizes=\"(max-width: 531px) 100vw, 531px\" \/><\/figure>\n\n\n\n<p>\u8fd9\u91ccAt\u4e3a\u4f18\u52bf\u51fd\u6570\uff1a<strong><em>Critic Model\u7528\u4e8e\u4f30\u8ba1\u72b6\u6001\u7684\u4ef7\u503c\u51fd\u6570 V(st)\uff0c\u4ece\u800c\u8ba1\u7b97\u7b56\u7565\u68af\u5ea6\u4e2d\u7684\u4f18\u52bf\u503cA(t)\uff0c<\/em>\u4e0b\u9762\u7684 <\/strong><em><strong>r(st,at) <\/strong><\/em><strong>\u51fd\u6570\u5c31\u662f RM \u6a21\u578b\u7684\u8f93\u51fa\uff1a \u7528\u4e8e\u8ba1\u7b97\u751f\u6210<\/strong>\u67d0\u4e2a<strong>token\u7684\u5373\u65f6\u6536\u76ca \u3002<\/strong> <strong>\u4e0b\u56fe\u8f6c\u6362\u53c2\u8003\uff1a<a rel=\"noreferrer noopener\" href=\"https:\/\/zhuanlan.zhihu.com\/p\/651780908\" target=\"_blank\">https:\/\/zhuanlan.zhihu.com\/p\/651780908<\/a><\/strong><\/p>\n\n\n\n<div class=\"wp-block-image\"><figure class=\"aligncenter size-full is-resized\"><img loading=\"lazy\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-114.png\" alt=\"\" class=\"wp-image-25345\" width=\"524\" height=\"191\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-114-300x110.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-114-768x282.png 768w\" sizes=\"(max-width: 524px) 100vw, 524px\" \/><\/figure><\/div>\n\n\n\n<figure class=\"wp-block-image size-full is-resized\"><img loading=\"lazy\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-105.png\" alt=\"\" class=\"wp-image-25292\" width=\"659\" height=\"339\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-105.png 973w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-105-300x154.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-105-768x395.png 768w\" sizes=\"(max-width: 659px) 100vw, 659px\" \/><\/figure>\n\n\n\n<h3><a rel=\"noreferrer noopener\" 
href=\"https:\/\/zhuanlan.zhihu.com\/p\/651780908\" target=\"_blank\">PPO\u8bad\u7ec3\u6d41\u7a0b<\/a><\/h3>\n\n\n\n<ul><li>Actor Model\uff1a\u8981\u8bad\u7ec3\u7684\u76ee\u6807\u8bed\u8a00\u6a21\u578b\uff0c\u7b56\u7565\u7f51\u7edc<\/li><li>Critic Model\uff1a\u9884\u4f30\u603b\u6536\u76ca <\/li><li>Reward Model\uff1a\u8ba1\u7b97\u5373\u65f6\u6536\u76ca<\/li><li>Reference Model\uff1a\u5728RLHF\u9636\u6bb5\u7ed9\u8bed\u8a00\u6a21\u578b\u589e\u52a0\u4e00\u4e9b\u201c\u7ea6\u675f\u201d\uff0c\u9632\u6b62\u8bed\u8a00\u6a21\u578b\u8bad\u504f<\/li><\/ul>\n\n\n\n<p>ColossalChat RLHF\u8fc7\u7a0b\u4e5f\u662f\u975e\u5e38\u63a5\u8fd1ChatGPT\u7684RLHF\u8fc7\u7a0b\uff0cRLHF\u8fc7\u7a0b\u4e3b\u8981\u6d89\u53ca\u56db\u4e2a\u6a21\u578b\u5206\u522b\u662fActor\u3001Critic\u3001RM\u3001SFT\uff0c\u635f\u5931\u51fd\u6570\u4e5f\u662f\u7531\u4e09\u4e2a\u635f\u5931\u51fd\u6570\u7ec4\u6210\u5206\u522b\u662f\u7b56\u7565\u635f\u5931\u3001\u4ef7\u503c\u635f\u5931\u548c PTX \u635f\u5931\u3002<\/p>\n\n\n\n<figure class=\"wp-block-image size-large\"><img loading=\"lazy\" width=\"1024\" height=\"445\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-131-1024x445.png\" alt=\"\" class=\"wp-image-25418\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-131-1024x445.png 1024w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-131-300x130.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-131-768x334.png 768w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-131.png 1066w\" sizes=\"(max-width: 1024px) 100vw, 1024px\" \/><figcaption>ColossalChat RLHF\u8fc7\u7a0b<\/figcaption><\/figure>\n\n\n\n<p><strong>\u7b56\u7565\u635f\u5931\u51fd\u6570\u8ba1\u7b97\uff1a<\/strong><\/p>\n\n\n\n<figure class=\"wp-block-image size-large\"><img loading=\"lazy\" width=\"1024\" height=\"632\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-132-1024x632.png\" alt=\"\" class=\"wp-image-25419\" 
srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-132-1024x632.png 1024w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-132-300x185.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-132-768x474.png 768w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-132-825x510.png 825w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-132.png 1066w\" sizes=\"(max-width: 1024px) 100vw, 1024px\" \/><figcaption>\u7b56\u7565\u635f\u5931\u8ba1\u7b97\u8fc7\u7a0b<\/figcaption><\/figure>\n\n\n\n<p>\u901a\u8fc7instruction dataset\u6570\u636e\u8bad\u7ec3SFT\u6a21\u578b\uff0c\u901a\u8fc7\u8ba1\u7b97sft model\u7684logits\u548cactor model\uff08\u6ca1\u6709\u7ecf\u8fc7sft\u7684model\uff09\u7684logits\u8ba1\u7b97kl\u6563\u5ea6\uff0c\u7136\u540e\u52a0\u4e0areward model\u7684\u6253\u5206\u53d8\u6210 reward R\u5956\u52b1\u503c\uff0c\u907f\u514d\u592a\u8fc7\u504f\u5411reward model\u52a0\u5165\u548csft model\u7684kl\u6563\u5ea6\uff0c\u540c\u65f6\u4e5f\u907f\u514d\u5f3a\u5316\u5b66\u4e60\u5c06actor\u6a21\u578b\u8bad\u6b6a\u3002<\/p>\n\n\n\n<figure class=\"wp-block-image size-full is-resized\"><img loading=\"lazy\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-133.png\" alt=\"\" class=\"wp-image-25420\" width=\"459\" height=\"47\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-133.png 537w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-133-300x31.png 300w\" sizes=\"(max-width: 459px) 100vw, 459px\" \/><\/figure>\n\n\n\n<figure class=\"wp-block-image size-full\"><img loading=\"lazy\" width=\"919\" height=\"292\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-134.png\" alt=\"\" class=\"wp-image-25422\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-134.png 919w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-134-300x95.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-134-768x244.png 768w\" sizes=\"(max-width: 
919px) 100vw, 919px\" \/><\/figure>\n\n\n\n<figure class=\"wp-block-image size-full\"><img loading=\"lazy\" width=\"447\" height=\"64\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-135.png\" alt=\"\" class=\"wp-image-25423\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-135.png 447w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-135-300x43.png 300w\" sizes=\"(max-width: 447px) 100vw, 447px\" \/><\/figure>\n\n\n\n<figure class=\"wp-block-image size-full is-resized\"><img loading=\"lazy\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-136.png\" alt=\"\" class=\"wp-image-25425\" width=\"415\" height=\"247\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-136.png 652w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-136-300x179.png 300w\" sizes=\"(max-width: 415px) 100vw, 415px\" \/><\/figure>\n\n\n\n<figure class=\"wp-block-image size-full\"><img loading=\"lazy\" width=\"937\" height=\"373\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-137.png\" alt=\"\" class=\"wp-image-25426\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-137.png 937w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-137-300x119.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-137-768x306.png 768w\" sizes=\"(max-width: 937px) 100vw, 937px\" \/><\/figure>\n\n\n\n<p>\u8fd9\u6837\u505a\u7684\u76ee\u7684\u5c31\u662f\u907f\u514d\u6a21\u578b\u8bad\u98de\uff0c\u8ba9\u6a21\u578b\u66f4\u65b0\u4fdd\u6301\u5728\u4e00\u4e2a\u5c0f\u8303\u56f4\u5185\u3002<\/p>\n\n\n\n<p><strong>\u4ef7\u503c\u635f\u5931\u51fd\u6570\u8ba1\u7b97\uff1a<\/strong><\/p>\n\n\n\n<figure class=\"wp-block-image size-full\"><img loading=\"lazy\" width=\"934\" height=\"336\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-138.png\" alt=\"\" class=\"wp-image-25427\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-138.png 934w, 
http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-138-300x108.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-138-768x276.png 768w\" sizes=\"(max-width: 934px) 100vw, 934px\" \/><\/figure>\n\n\n\n<figure class=\"wp-block-image size-full is-resized\"><img loading=\"lazy\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-139.png\" alt=\"\" class=\"wp-image-25428\" width=\"203\" height=\"33\"\/><\/figure>\n\n\n\n<p>\u4e0a\u5f0fR\u662freward model\u548csft model\u8ba1\u7b97\u51fa\u6765\u7684\u53cd\u9988\u5206\u6570\uff0cV(s)\u662fCritic Model\u8f93\u51fa\u7684\u4ef7\u503c\u5206\u6570\u3002\u4e3b\u8981\u662f\u8861\u91cfreward\u5206\u6570\u548c\u4ef7\u503c\u51fd\u6570\u5206\u6570\u7684\u5747\u65b9\u8bef\u5dee\u3002<\/p>\n\n\n\n<p><strong>ptx\u7684\u635f\u5931\u8ba1\u7b97\uff1a<\/strong><\/p>\n\n\n\n<figure class=\"wp-block-image size-full\"><img loading=\"lazy\" width=\"943\" height=\"237\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-140.png\" alt=\"\" class=\"wp-image-25429\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-140.png 943w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-140-300x75.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-140-768x193.png 768w\" sizes=\"(max-width: 943px) 100vw, 943px\" \/><\/figure>\n\n\n\n<p>\u8ba1\u7b97Actor\u8f93\u51faresponse\u548c\u8f93\u5165\u8bed\u6599\u7684\u56de\u7b54\u90e8\u5206\u7684\u4ea4\u53c9\u71b5\u635f\u5931\u51fd\u6570\uff0c\u7528\u6765\u5728PPO\u68af\u5ea6\u4e2d\u52a0\u5165\u9884\u8bad\u7ec3\u68af\u5ea6\uff0c\u4ee5\u4fdd\u6301\u8bed\u8a00\u6a21\u578b\u539f\u6709\u6027\u80fd\u9632\u6b62\u9057\u5fd8\u3002\u8fd9\u4e2a\u5c31\u662finstruct gpt\u8bba\u6587\u4e2d\u5728\u5f3a\u5316\u5b66\u4e60\u4e2d\u52a0\u5165\u9884\u8bad\u7ec3\u68af\u5ea6\u4ee5\u9632\u8fc7\u5ea6\u62df\u5408ppo\u6570\u636e\u5e26\u6765nlp\u901a\u7528\u4efb\u52a1\u80fd\u529b\u7684\u4e0b\u964d\u64cd\u4f5c\u3002<\/p>\n\n\n\n<figure 
class=\"wp-block-image size-full is-resized\"><img loading=\"lazy\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-141.png\" alt=\"\" class=\"wp-image-25430\" width=\"195\" height=\"32\"\/><\/figure>\n\n\n\n<p><strong>\u603b\u7684\u5f3a\u5316\u5b66\u4e60\u635f\u5931\u8ba1\u7b97\uff1a<\/strong><\/p>\n\n\n\n<figure class=\"wp-block-image size-full is-resized\"><img loading=\"lazy\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-142.png\" alt=\"\" class=\"wp-image-25431\" width=\"208\" height=\"32\"\/><\/figure>\n\n\n\n<h4><a href=\"https:\/\/www.cnblogs.com\/end\/p\/17481052.html\">\u4e3a\u4ec0\u4e48RLHF\u4e2d\uff0cPPO\u9700\u8981Critic\u6a21\u578b\u800c\u4e0d\u662f\u76f4\u63a5\u4f7f\u7528RewardModel<\/a><\/h4>\n\n\n\n<p>\u5728\u5f3a\u5316\u5b66\u4e60\u4e2d\uff0cPPO\uff08Proximal Policy Optimization\uff09\u7b97\u6cd5\u662f\u4e00\u79cd\u57fa\u4e8e\u7b56\u7565\u68af\u5ea6\u7684\u65b9\u6cd5\uff0c\u7528\u4e8e\u8bad\u7ec3\u5f3a\u5316\u5b66\u4e60\u667a\u80fd\u4f53\u3002<strong>PPO\u7b97\u6cd5\u4e2d\u5f15\u5165Critic\u6a21\u578b\u7684\u4e3b\u8981\u76ee\u7684\u662f\u4e3a\u4e86\u63d0\u4f9b\u4e00\u4e2a\u4ef7\u503c\u4f30\u8ba1\u5668\uff0c\u7528\u4e8e\u8bc4\u4f30\u72b6\u6001\u6216\u72b6\u6001\u52a8\u4f5c\u5bf9\u7684\u4ef7\u503c\uff0c\u4ece\u800c\u8f85\u52a9\u7b56\u7565\u7684\u66f4\u65b0\u548c\u4f18\u5316\u3002<\/strong><\/p>\n\n\n\n<p>\u867d\u7136<strong>\u5956\u52b1\u6a21\u578b\uff08Reward 
Model\uff09\u53ef\u4ee5\u63d0\u4f9b\u6bcf\u4e2a\u72b6\u6001\u6216\u72b6\u6001\u52a8\u4f5c\u5bf9\u7684\u5373\u65f6\u5956\u52b1\u4fe1\u53f7\uff0c\u4f46\u5b83\u5e76\u4e0d\u80fd\u76f4\u63a5\u63d0\u4f9b\u5bf9\u5e94\u7684\u4ef7\u503c\u4f30\u8ba1\u3002\u5956\u52b1\u4fe1\u53f7\u53ea\u53cd\u6620\u4e86\u5f53\u524d\u52a8\u4f5c\u7684\u5373\u65f6\u53cd\u9988\uff0c\u800c\u5e76\u6ca1\u6709\u63d0\u4f9b\u5173\u4e8e\u5728\u957f\u671f\u65f6\u95f4\u5c3a\u5ea6\u4e0a\u7684\u4ef7\u503c\u4fe1\u606f\u3002<\/strong><\/p>\n\n\n\n<p><strong>Critic\u6a21\u578b\u7684\u4f5c\u7528\u662f\u4f30\u8ba1\u72b6\u6001\u6216\u72b6\u6001\u52a8\u4f5c\u5bf9\u7684\u957f\u671f\u4ef7\u503c\uff0c\u4e5f\u79f0\u4e3a\u72b6\u6001\u503c\u51fd\u6570\u6216\u52a8\u4f5c\u503c\u51fd\u6570\u3002<\/strong>Critic\u6a21\u578b\u80fd\u591f\u5b66\u4e60\u548c\u9884\u6d4b\u5728\u5f53\u524d\u72b6\u6001\u4e0b\u91c7\u53d6\u4e0d\u540c\u52a8\u4f5c\u6240\u83b7\u5f97\u7684\u7d2f\u79ef\u5956\u52b1\uff0c\u5b83\u63d0\u4f9b\u4e86\u5bf9\u7b56\u7565\u6539\u8fdb\u7684\u6307\u5bfc\u3002PPO\u7b97\u6cd5\u4f7f\u7528Critic\u6a21\u578b\u7684\u4f30\u8ba1\u503c\u6765\u8ba1\u7b97\u4f18\u52bf\u51fd\u6570\uff0c\u4ece\u800c\u8c03\u6574\u7b56\u7565\u7684\u66f4\u65b0\u5e45\u5ea6\uff0c\u4f7f\u5f97\u66f4\u6709\u5229\u4e8e\u4ea7\u751f\u66f4\u9ad8\u957f\u671f\u56de\u62a5\u7684\u52a8\u4f5c\u88ab\u9009\u62e9\u3002<\/p>\n\n\n\n<p>\u53e6\u5916\uff0cCritic\u6a21\u578b\u8fd8\u53ef\u4ee5\u7528\u4e8e\u8bc4\u4f30\u4e0d\u540c\u7b56\u7565\u7684\u6027\u80fd\uff0c\u4e3a\u6a21\u578b\u7684\u8bc4\u4f30\u548c\u9009\u62e9\u63d0\u4f9b\u4f9d\u636e\u3002PPO\u7b97\u6cd5\u4e2d\u7684Actor-Critic\u67b6\u6784\u5141\u8bb8\u667a\u80fd\u4f53\u540c\u65f6\u5b66\u4e60\u7b56\u7565\u548c\u4ef7\u503c\u51fd\u6570\uff0c\u5e76\u901a\u8fc7\u534f\u540c\u8bad\u7ec3\u6765\u63d0\u9ad8\u6027\u80fd\u3002<\/p>\n\n\n\n<p>\u56e0\u6b64\uff0c\u5728RLHF\uff08Reinforcement Learning from Human 
Feedback\uff09\u4e2d\uff0cPPO\u7b97\u6cd5\u9700\u8981Critic\u6a21\u578b\u800c\u4e0d\u662f\u76f4\u63a5\u4f7f\u7528\u5956\u52b1\u6a21\u578b\uff0c\u662f\u4e3a\u4e86\u63d0\u4f9b\u5bf9\u72b6\u6001\u6216\u72b6\u6001\u52a8\u4f5c\u5bf9\u7684\u4ef7\u503c\u4f30\u8ba1\uff0c\u5e76\u652f\u6301\u7b56\u7565\u7684\u6539\u8fdb\u548c\u4f18\u5316\u3002Critic\u6a21\u578b\u7684\u5f15\u5165\u53ef\u4ee5\u63d0\u4f9b\u66f4\u5168\u9762\u548c\u51c6\u786e\u7684\u4fe1\u606f\uff0c\u4ece\u800c\u589e\u5f3a\u7b97\u6cd5\u7684\u8bad\u7ec3\u6548\u679c\u548c\u5b66\u4e60\u80fd\u529b\u3002<\/p>\n\n\n\n<p><strong>\u5373\u65f6\u5956\u52b1 \u4e0e \u72b6\u6001\u52a8\u4f5c\u5bf9\u7684\u957f\u671f\u4ef7\u503c \u7684\u5dee\u522b\u662f\u4ec0\u4e48\uff1f<\/strong><\/p>\n\n\n\n<p>\u5373\u65f6\u5956\u52b1\uff08Immediate Reward\uff09\u548c\u72b6\u6001\u52a8\u4f5c\u5bf9\u7684\u957f\u671f\u4ef7\u503c\uff08Long-Term Value\uff09\u4ee3\u8868\u4e86\u5f3a\u5316\u5b66\u4e60\u4e2d\u4e0d\u540c\u7684\u6982\u5ff5\u548c\u65f6\u95f4\u5c3a\u5ea6\u3002<\/p>\n\n\n\n<p>\u5373\u65f6\u5956\u52b1\u662f\u6307\u667a\u80fd\u4f53\u5728\u6267\u884c\u67d0\u4e2a\u52a8\u4f5c\u540e\u7acb\u5373\u83b7\u5f97\u7684\u53cd\u9988\u4fe1\u53f7\u3002\u5b83\u901a\u5e38\u7531\u73af\u5883\u63d0\u4f9b\uff0c\u7528\u4e8e\u8868\u793a\u5f53\u524d\u52a8\u4f5c\u7684\u597d\u574f\u7a0b\u5ea6\u3002\u5373\u65f6\u5956\u52b1\u662f\u4e00\u79cd\u5373\u65f6\u53cd\u9988\uff0c\u53ef\u4ee5\u6307\u793a\u5f53\u524d\u52a8\u4f5c\u7684\u7acb\u5373\u7ed3\u679c\u662f\u5426\u7b26\u5408\u667a\u80fd\u4f53\u7684\u76ee\u6807\u3002<\/p>\n\n\n\n<p>\u800c\u72b6\u6001\u52a8\u4f5c\u5bf9\u7684\u957f\u671f\u4ef7\u503c\u6d89\u53ca\u66f4\u957f\u65f6\u95f4\u5c3a\u5ea6\u4e0a\u7684\u8bc4\u4f30\uff0c\u5b83\u8003\u8651\u4e86\u667a\u80fd\u4f53\u5728\u5f53\u524d\u72b6\u6001\u4e0b\u9009\u62e9\u4e0d\u540c\u52a8\u4f5c\u6240\u5bfc\u81f4\u7684\u672a\u6765\u56de\u62a5\u7684\u7d2f\u79ef\u3002\u957f\u671f\u4ef7\u503c\u53ef\u4ee5\u8868\u793a\u4e3a\u72b6\u6001\u503c\u51fd\u6570\uff08State Value 
Function\uff09\u6216\u52a8\u4f5c\u503c\u51fd\u6570\uff08Action Value Function\uff09\u3002<\/p>\n\n\n\n<p><strong>\u72b6\u6001\u503c\u51fd\u6570\uff08V-function\uff09<\/strong>\u8868\u793a\u5728\u7ed9\u5b9a\u72b6\u6001\u4e0b\uff0c\u667a\u80fd\u4f53\u4ece\u8be5\u72b6\u6001\u5f00\u59cb\u6267\u884c\u4e00\u7cfb\u5217\u52a8\u4f5c\uff0c\u7136\u540e\u6309\u7167\u67d0\u4e2a\u7b56\u7565\u8fdb\u884c\u51b3\u7b56\uff0c\u4ece\u800c\u83b7\u5f97\u7684\u9884\u671f\u7d2f\u79ef\u56de\u62a5\u3002<strong>\u72b6\u6001\u503c\u51fd\u6570\u4f30\u8ba1\u4e86\u667a\u80fd\u4f53\u5904\u4e8e\u67d0\u4e2a\u72b6\u6001\u65f6\u6240\u80fd\u83b7\u5f97\u7684\u957f\u671f\u4ef7\u503c\uff0c\u53cd\u6620\u4e86\u72b6\u6001\u7684\u4f18\u52a3\u7a0b\u5ea6<\/strong>\u3002<\/p>\n\n\n\n<p><strong>\u52a8\u4f5c\u503c\u51fd\u6570\uff08Q-function\uff09<\/strong>\u5219\u8868\u793a\u5728\u7ed9\u5b9a\u72b6\u6001\u4e0b\uff0c\u667a\u80fd\u4f53\u9009\u62e9\u67d0\u4e2a\u52a8\u4f5c\u540e\uff0c\u6309\u7167\u67d0\u4e2a\u7b56\u7565\u8fdb\u884c\u51b3\u7b56\uff0c\u4ece\u8be5\u72b6\u6001\u8f6c\u79fb\u5230\u4e0b\u4e00\u4e2a\u72b6\u6001\u5e76\u83b7\u5f97\u9884\u671f\u7d2f\u79ef\u56de\u62a5\u7684\u4ef7\u503c\u3002<strong>\u52a8\u4f5c\u503c\u51fd\u6570\u4f30\u8ba1\u4e86\u5728\u7ed9\u5b9a\u72b6\u6001\u4e0b\u91c7\u53d6\u4e0d\u540c\u52a8\u4f5c\u7684\u957f\u671f\u4ef7\u503c\uff0c\u53ef\u4ee5\u5e2e\u52a9\u667a\u80fd\u4f53\u9009\u62e9\u5728\u6bcf\u4e2a\u72b6\u6001\u4e0b\u6700\u4f18\u7684\u52a8\u4f5c<\/strong>\u3002<\/p>\n\n\n\n<p>\u957f\u671f\u4ef7\u503c\u8003\u8651\u4e86\u667a\u80fd\u4f53\u5728\u672a\u6765\u7684\u51b3\u7b56\u8fc7\u7a0b\u4e2d\u6240\u80fd\u83b7\u5f97\u7684\u7d2f\u79ef\u56de\u62a5\uff0c\u76f8\u6bd4\u4e4b\u4e0b\uff0c\u5373\u65f6\u5956\u52b1\u53ea\u63d0\u4f9b\u4e86\u5f53\u524d\u52a8\u4f5c\u7684\u5373\u65f6\u53cd\u9988\u3002\u957f\u671f\u4ef7\u503c\u5bf9\u667a\u80fd\u4f53\u7684\u51b3\u7b56\u5177\u6709\u66f4\u5168\u9762\u7684\u5f71\u54cd\uff0c\u53ef\u4ee5\u5e2e\u52a9\u667a\u80fd\u4f53\u66f4\u597d\u5730\u8bc4\u4f30\u5f53\u524d\u72b6\
u6001\u548c\u52a8\u4f5c\u7684\u957f\u671f\u6548\u679c\uff0c\u5e76\u6307\u5bfc\u667a\u80fd\u4f53\u5728\u957f\u671f\u65f6\u95f4\u5c3a\u5ea6\u4e0a\u4f5c\u51fa\u66f4\u4f18\u7684\u51b3\u7b56\u3002<\/p>\n\n\n\n<p>\u5728\u5f3a\u5316\u5b66\u4e60\u4e2d\uff0c\u957f\u671f\u4ef7\u503c\u7684\u4f30\u8ba1\u5bf9\u4e8e\u786e\u5b9a\u6027\u7b56\u7565\u9009\u62e9\u548c\u4ef7\u503c\u4f18\u5316\u975e\u5e38\u91cd\u8981\uff0c\u800c\u5373\u65f6\u5956\u52b1\u5219\u63d0\u4f9b\u4e86\u5bf9\u5f53\u524d\u52a8\u4f5c\u7684\u76f4\u63a5\u53cd\u9988\u3002\u8fd9\u4e24\u8005\u76f8\u4e92\u8865\u5145\uff0c\u7ed3\u5408\u8d77\u6765\u53ef\u4ee5\u5e2e\u52a9\u667a\u80fd\u4f53\u5b9e\u73b0\u66f4\u597d\u7684\u51b3\u7b56\u548c\u5b66\u4e60\u6548\u679c\u3002<\/p>\n\n\n\n<p><strong>PPO\u4e2d\u4f18\u52bf\u51fd\u6570\u6307\u4ec0\u4e48<\/strong><\/p>\n\n\n\n<p>\u5728Proximal Policy Optimization\uff08PPO\uff09\u7b97\u6cd5\u4e2d\uff0c<strong>\u4f18\u52bf\u51fd\u6570\uff08Advantage Function\uff09\u7528\u4e8e\u8bc4\u4f30\u72b6\u6001-\u52a8\u4f5c\u5bf9\u7684\u76f8\u5bf9\u4f18\u52a3\u7a0b\u5ea6\u3002\u5b83\u8861\u91cf\u4e86\u6267\u884c\u67d0\u4e2a\u52a8\u4f5c\u76f8\u5bf9\u4e8e\u5e73\u5747\u6c34\u5e73\u7684\u4f18\u52a3\uff0c\u5373\u5728\u7ed9\u5b9a\u72b6\u6001\u4e0b\u91c7\u53d6\u67d0\u4e2a\u52a8\u4f5c\u76f8\u5bf9\u4e8e\u91c7\u53d6\u5e73\u5747\u52a8\u4f5c\u7684\u6548\u679c\u3002<\/strong><\/p>\n\n\n\n<p>\u4f18\u52bf\u51fd\u6570\u53ef\u4ee5\u7528\u4ee5\u4e0b\u65b9\u5f0f\u5b9a\u4e49\uff1a<code>Advantage(s, a) =&nbsp;Q(s, a) -&nbsp;V(s)<\/code><\/p>\n\n\n\n<p>\u5176\u4e2d\uff0c<code>Advantage(s, a)<\/code>\u8868\u793a\u5728\u72b6\u6001&nbsp;<code>s<\/code>&nbsp;\u4e0b\u91c7\u53d6\u52a8\u4f5c&nbsp;<code>a<\/code>&nbsp;\u7684\u4f18\u52bf\u51fd\u6570\u503c\uff0c<code>Q(s, a)<\/code>&nbsp;\u8868\u793a\u72b6\u6001\u52a8\u4f5c\u5bf9&nbsp;<code>(s, 
a)<\/code>&nbsp;\u7684\u52a8\u4f5c\u503c\u51fd\u6570\uff08\u5373 Q \u51fd\u6570\uff09\uff0c<code>V(s)<\/code>&nbsp;\u8868\u793a\u72b6\u6001\u503c\u51fd\u6570\u3002<\/p>\n\n\n\n<p><strong>\u4f18\u52bf\u51fd\u6570\u7684\u4f5c\u7528\u5728\u4e8e\u5e2e\u52a9\u8bc4\u4f30\u5f53\u524d\u52a8\u4f5c\u7684\u76f8\u5bf9\u4ef7\u503c\uff0c\u4ee5\u4fbf\u5728\u7b56\u7565\u66f4\u65b0\u8fc7\u7a0b\u4e2d\u786e\u5b9a\u5e94\u91c7\u53d6\u7684\u52a8\u4f5c\u3002\u901a\u8fc7\u6bd4\u8f83\u4e0d\u540c\u52a8\u4f5c\u7684\u4f18\u52bf\u51fd\u6570\u503c\uff0c\u53ef\u4ee5\u51b3\u5b9a\u54ea\u4e9b\u52a8\u4f5c\u662f\u66f4\u597d\u7684\u9009\u62e9\u3002\u6b63\u7684\u4f18\u52bf\u51fd\u6570\u503c\u8868\u793a\u6267\u884c\u7684\u52a8\u4f5c\u6bd4\u5e73\u5747\u6c34\u5e73\u66f4\u597d\uff0c\u800c\u8d1f\u7684\u4f18\u52bf\u51fd\u6570\u503c\u8868\u793a\u6267\u884c\u7684\u52a8\u4f5c\u6bd4\u5e73\u5747\u6c34\u5e73\u66f4\u5dee\u3002<\/strong><\/p>\n\n\n\n<p><strong>\u5728PPO\u7b97\u6cd5\u4e2d\uff0c\u4f18\u52bf\u51fd\u6570\u7528\u4e8e\u8ba1\u7b97\u7b56\u7565\u66f4\u65b0\u7684\u76ee\u6807\uff0c\u4ee5\u4fbf\u8c03\u6574\u7b56\u7565\u6982\u7387\u5206\u5e03\u6765\u63d0\u9ad8\u4f18\u52bf\u51fd\u6570\u4e3a\u6b63\u7684\u52a8\u4f5c\u7684\u6982\u7387\uff0c\u5e76\u964d\u4f4e\u4f18\u52bf\u51fd\u6570\u4e3a\u8d1f\u7684\u52a8\u4f5c\u7684\u6982\u7387\uff0c\u4ece\u800c\u6539\u8fdb\u7b56\u7565\u7684\u6027\u80fd\u3002<\/strong><\/p>\n\n\n\n<p>\u603b\u800c\u8a00\u4e4b\uff0c\u4f18\u52bf\u51fd\u6570\u5728PPO\u7b97\u6cd5\u4e2d\u7528\u4e8e\u8bc4\u4f30\u72b6\u6001-\u52a8\u4f5c\u5bf9\u7684\u76f8\u5bf9\u4f18\u52a3\uff0c\u5e2e\u52a9\u786e\u5b9a\u5e94\u8be5\u91c7\u53d6\u7684\u52a8\u4f5c\uff0c\u5e76\u5728\u7b56\u7565\u66f4\u65b0\u8fc7\u7a0b\u4e2d\u5f15\u5bfc\u7b56\u7565\u5411\u66f4\u4f18\u7684\u65b9\u5411\u8c03\u6574\u3002<\/p>\n\n\n\n<h2><a rel=\"noreferrer noopener\" href=\"https:\/\/zhuanlan.zhihu.com\/p\/21046265072\" target=\"_blank\">GRPO (Group Relative Policy 
Optimization)<\/a><\/h2>\n\n\n\n<p>\u4f20\u7edf\u7684\u5f3a\u5316\u5b66\u4e60\u7b97\u6cd5\uff08\u5982Proximal Policy Optimization\uff0cPPO\uff09\u5728\u5e94\u7528\u4e8eLLMs\u7684\u63a8\u7406\u4efb\u52a1\u65f6\u9762\u4e34\u7740\u91cd\u5927\u6311\u6218\uff1a<\/p>\n\n\n\n<ol><li><strong>\u4f9d\u8d56\u6279\u8bc4\u8005\u6a21\u578b\uff1a<\/strong><\/li><\/ol>\n\n\n\n<ul><li>PPO\u9700\u8981\u4e00\u4e2a\u72ec\u7acb\u7684\u6279\u8bc4\u8005\u6a21\u578b\u6765\u8bc4\u4f30\u6bcf\u4e2a\u56de\u7b54\u7684\u4ef7\u503c\uff0c\u8fd9\u4f7f\u5185\u5b58\u548c\u8ba1\u7b97\u9700\u6c42\u589e\u52a0\u4e86\u4e00\u500d\u3002<\/li><li>\u8bad\u7ec3\u6279\u8bc4\u8005\u6a21\u578b\u975e\u5e38\u590d\u6742\u4e14\u5bb9\u6613\u51fa\u9519\uff0c\u5c24\u5176\u662f\u5728\u9700\u8981\u5bf9\u4e3b\u89c2\u6216\u7ec6\u5fae\u5dee\u522b\u8fdb\u884c\u8bc4\u4ef7\u7684\u4efb\u52a1\u4e2d\u3002<\/li><\/ul>\n\n\n\n<p><strong>2. \u9ad8\u6602\u7684\u8ba1\u7b97\u6210\u672c\uff1a<\/strong><\/p>\n\n\n\n<ul><li>\u5f3a\u5316\u5b66\u4e60\u6d41\u7a0b\u901a\u5e38\u9700\u8981\u5927\u91cf\u8ba1\u7b97\u8d44\u6e90\u6765\u8fed\u4ee3\u8bc4\u4f30\u548c\u4f18\u5316\u56de\u7b54\u3002<\/li><li>\u5c06\u8fd9\u4e9b\u65b9\u6cd5\u6269\u5c55\u5230\u66f4\u5927\u7684LLMs\u4f1a\u8fdb\u4e00\u6b65\u52a0\u5267\u6210\u672c\u3002<\/li><\/ul>\n\n\n\n<p><strong>3. 
\u53ef\u6269\u5c55\u6027\u95ee\u9898\uff1a<\/strong><\/p>\n\n\n\n<ul><li>\u7edd\u5bf9\u5956\u52b1\u8bc4\u4f30\u96be\u4ee5\u5e94\u5bf9\u591a\u6837\u5316\u4efb\u52a1\uff0c\u4f7f\u5f97\u8de8\u63a8\u7406\u9886\u57df\u7684\u6cdb\u5316\u53d8\u5f97\u56f0\u96be\u3002<\/li><\/ul>\n\n\n\n<p><strong>GRPO\u5982\u4f55\u5e94\u5bf9\u8fd9\u4e9b\u6311\u6218\uff1a<\/strong><\/p>\n\n\n\n<ul><li><strong>\u65e0\u6279\u8bc4\u8005\u4f18\u5316\uff1a<\/strong>&nbsp;GRPO\u901a\u8fc7\u6bd4\u8f83\u7ec4\u5185\u56de\u7b54\uff0c\u6d88\u9664\u4e86\u5bf9\u6279\u8bc4\u8005\u6a21\u578b\u7684\u9700\u6c42\uff0c\u663e\u8457\u964d\u4f4e\u4e86\u8ba1\u7b97\u5f00\u9500\u3002<\/li><li><strong>\u76f8\u5bf9\u8bc4\u4f30\uff1a<\/strong>&nbsp;GRPO\u4e0d\u4f9d\u8d56\u5916\u90e8\u8bc4\u4ef7\u8005\uff0c\u800c\u662f\u5229\u7528\u7ec4\u5185\u52a8\u6001\u6765\u8bc4\u4f30\u6bcf\u4e2a\u56de\u7b54\u5728\u540c\u4e00\u6279\u6b21\u4e2d\u7684\u76f8\u5bf9\u8868\u73b0\u3002<\/li><li><strong>\u9ad8\u6548\u8bad\u7ec3\uff1a<\/strong>&nbsp;\u901a\u8fc7\u4e13\u6ce8\u4e8e\u7ec4\u5185\u4f18\u52bf\uff0cGRPO\u7b80\u5316\u4e86\u5956\u52b1\u4f30\u8ba1\u6d41\u7a0b\uff0c\u4f7f\u5176\u5bf9\u5927\u578b\u6a21\u578b\u7684\u8bad\u7ec3\u66f4\u5feb\u4e14\u66f4\u5177\u53ef\u6269\u5c55\u6027\u3002<\/li><\/ul>\n\n\n\n<p>GRPO\u7684\u6838\u5fc3\u601d\u60f3\u662f\u901a\u8fc7<strong>\u7ec4\u5185\u76f8\u5bf9\u5956\u52b1<\/strong>\u6765\u4f30\u8ba1\u57fa\u7ebf\uff08baseline\uff09\uff0c\u4ece\u800c\u907f\u514d\u4f7f\u7528\u989d\u5916\u7684<strong>\u4ef7\u503c\u51fd\u6570\u6a21\u578b\uff08critic model\uff09<\/strong>\u3002\u4f20\u7edf\u7684PPO\u7b97\u6cd5\u9700\u8981\u8bad\u7ec3\u4e00\u4e2a\u4ef7\u503c\u51fd\u6570\u6765\u4f30\u8ba1<strong>\u4f18\u52bf\u51fd\u6570<\/strong>\uff08advantage 
function\uff09\uff0c\u800cGRPO\u901a\u8fc7\u4ece\u540c\u4e00\u95ee\u9898\u7684\u591a\u4e2a\u8f93\u51fa\u4e2d\u8ba1\u7b97\u5e73\u5747\u5956\u52b1\u6765\u66ff\u4ee3\u8fd9\u4e00\u8fc7\u7a0b\uff0c\u663e\u8457\u51cf\u5c11\u4e86\u5185\u5b58\u548c\u8ba1\u7b97\u8d44\u6e90\u7684\u6d88\u8017\u3002<\/p>\n\n\n\n<p><strong>Group Relative Policy Optimization (GRPO)<\/strong>\uff0c\u4e0d\u518d\u9700\u8981\u50cfPPO\u90a3\u6837\u52a0\u5165\u989d\u5916\u7684\u4ef7\u503c\u51fd\u6570\u8fd1\u4f3c<strong><em>\uff0c\u800c\u662f\u76f4\u63a5\u4f7f\u7528\u591a\u4e2a\u91c7\u6837\u8f93\u51fa\u7684\u5e73\u5747\u5956\u52b1\u4f5c\u4e3aBaseline<\/em><\/strong>\uff0c\u663e\u8457\u51cf\u5c11\u4e86\u8bad\u7ec3\u8d44\u6e90\u7684\u4f7f\u7528\u3002<\/p>\n\n\n\n<figure class=\"wp-block-image size-full\"><img loading=\"lazy\" width=\"955\" height=\"418\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/05\/image-37.png\" alt=\"\" class=\"wp-image-26452\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/05\/image-37.png 955w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/05\/image-37-300x131.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/05\/image-37-768x336.png 768w\" sizes=\"(max-width: 955px) 100vw, 955px\" \/><\/figure>\n\n\n\n<p>\u5177\u4f53\u6765\u8bf4\uff0c\u5bf9\u4e8e\u6bcf\u4e2a\u95ee\u9898&nbsp;<em>q<\/em>\uff0cGRPO \u4ece\u65e7\u7b56\u7565&nbsp;<em>\u03c0<\/em><sub><em>\u03b8<\/em><sub>old<\/sub><\/sub> \u4e2d\u91c7\u6837\u4e00\u7ec4\u8f93\u51fa {<em>o<\/em><sub>1<\/sub>, <em>o<\/em><sub>2<\/sub>, \u2026, <em>o<\/em><sub><em>G<\/em><\/sub>}\uff0c\u7136\u540e\u901a\u8fc7\u6700\u5927\u5316\u4ee5\u4e0b\u76ee\u6807\u51fd\u6570\u6765\u4f18\u5316\u7b56\u7565\u6a21\u578b\uff1a<\/p>\n\n\n\n<figure class=\"wp-block-image size-full\"><img loading=\"lazy\" width=\"985\" height=\"123\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/05\/image-38.png\" alt=\"\" class=\"wp-image-26454\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/05\/image-38.png 985w, 
http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/05\/image-38-300x37.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/05\/image-38-768x96.png 768w\" sizes=\"(max-width: 985px) 100vw, 985px\" \/><\/figure>\n\n\n\n<p>\u5176\u4e2d\uff0c<em>\u03f5<\/em>&nbsp;\u548c&nbsp;<em>\u03b2<\/em>&nbsp;\u662f\u8d85\u53c2\u6570\uff0c<em>\u00c2<\/em><sub><em>i<\/em>,<em>j<\/em><\/sub> \u662f\u57fa\u4e8e\u7ec4\u5185\u5956\u52b1\u7684\u76f8\u5bf9\u4f18\u52bf\u4f30\u8ba1\u3002\u4e0e PPO \u4e0d\u540c\uff0c<strong>GRPO \u901a\u8fc7\u76f4\u63a5\u4f7f\u7528\u5956\u52b1\u6a21\u578b\u7684\u8f93\u51fa\u6765\u4f30\u8ba1\u57fa\u7ebf\uff0c\u907f\u514d\u4e86\u8bad\u7ec3\u4e00\u4e2a\u590d\u6742\u7684\u503c\u51fd\u6570<\/strong>\u3002<\/p>\n\n\n\n<p>\u6b64\u5916\uff0cGRPO \u901a\u8fc7\u76f4\u63a5\u5728\u635f\u5931\u51fd\u6570\u4e2d\u52a0\u5165\u7b56\u7565\u6a21\u578b\u548c\u53c2\u8003\u6a21\u578b\u4e4b\u95f4\u7684 KL \u6563\u5ea6\u6765\u6b63\u5219\u5316\uff0c\u800c\u4e0d\u662f\u5728\u5956\u52b1\u4e2d\u52a0\u5165 KL 
\u60e9\u7f5a\u9879\uff0c\u4ece\u800c\u7b80\u5316\u4e86\u8bad\u7ec3\u8fc7\u7a0b\u3002<\/p>\n\n\n\n<p><strong>GRPO\u7684\u8ba1\u7b97\u6d41\u7a0b\u5305\u62ec\uff1a<\/strong><\/p>\n\n\n\n<ol><li>\u91c7\u6837\u4e00\u7ec4\u8f93\u51fa\u5e76\u8ba1\u7b97\u6bcf\u4e2a\u8f93\u51fa\u7684\u5956\u52b1\u3002<\/li><li>\u5bf9\u7ec4\u5185\u5956\u52b1\u8fdb\u884c\u5f52\u4e00\u5316\u5904\u7406\u3002<\/li><li><strong>\u4f7f\u7528\u5f52\u4e00\u5316\u540e\u7684\u5956\u52b1\u8ba1\u7b97\u4f18\u52bf\u51fd\u6570\u3002<\/strong><\/li><li>\u901a\u8fc7\u6700\u5927\u5316\u76ee\u6807\u51fd\u6570\u66f4\u65b0\u7b56\u7565\u6a21\u578b\u3002<\/li><li>\u8fed\u4ee3\u8bad\u7ec3\uff0c\u9010\u6b65\u4f18\u5316\u7b56\u7565\u6a21\u578b\u3002<\/li><\/ol>\n\n\n\n<p>GRPO\u901a\u8fc7\u7ec4\u5185\u76f8\u5bf9\u5956\u52b1\u4f30\u8ba1\u57fa\u7ebf\uff0c\u907f\u514d\u4e86\u4f20\u7edfPPO\u4e2d\u4ef7\u503c\u51fd\u6570\u7684\u4f7f\u7528\uff0c\u663e\u8457\u51cf\u5c11\u4e86\u8bad\u7ec3\u8d44\u6e90\u6d88\u8017\uff0c\u540c\u65f6\u63d0\u5347\u4e86\u6a21\u578b\u5728\u6570\u5b66\u63a8\u7406\u7b49\u590d\u6742\u4efb\u52a1\u4e2d\u7684\u8868\u73b0\u3002<\/p>\n\n\n\n<figure class=\"wp-block-image\"><img src=\"https:\/\/antarina.tech\/assets\/img\/rlhf_2\/image-20240804164951112.png\" alt=\"image-20240804164951112\"\/><\/figure>\n\n\n\n<figure class=\"wp-block-image size-large\"><img loading=\"lazy\" width=\"1024\" height=\"562\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/06\/image-84-1024x562.png\" alt=\"\" class=\"wp-image-27228\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/06\/image-84-1024x562.png 1024w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/06\/image-84-300x165.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/06\/image-84-768x422.png 768w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/06\/image-84.png 1111w\" sizes=\"(max-width: 1024px) 100vw, 1024px\" \/><\/figure>\n\n\n\n<p><em><strong>GRPO \u8ba1\u7b97\u603b\u7ed3<\/strong><\/em>\uff1a<\/p>\n\n\n\n<figure class=\"wp-block-image 
size-full\"><img loading=\"lazy\" width=\"838\" height=\"421\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/06\/image-32.png\" alt=\"\" class=\"wp-image-26901\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/06\/image-32.png 838w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/06\/image-32-300x151.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/06\/image-32-768x386.png 768w\" sizes=\"(max-width: 838px) 100vw, 838px\" \/><\/figure>\n\n\n\n<figure class=\"wp-block-image size-full\"><img loading=\"lazy\" width=\"861\" height=\"594\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/06\/image-33.png\" alt=\"\" class=\"wp-image-26903\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/06\/image-33.png 861w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/06\/image-33-300x207.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/06\/image-33-768x530.png 768w\" sizes=\"(max-width: 861px) 100vw, 861px\" \/><\/figure>\n\n\n\n<figure class=\"wp-block-image size-full\"><img loading=\"lazy\" width=\"877\" height=\"520\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/06\/image-34.png\" alt=\"\" class=\"wp-image-26906\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/06\/image-34.png 877w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/06\/image-34-300x178.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/06\/image-34-768x455.png 768w\" sizes=\"(max-width: 877px) 100vw, 877px\" \/><\/figure>\n\n\n\n<p class=\"has-light-pink-background-color 
has-background\"><strong>GRPO\u7684\u6838\u5fc3\u601d\u60f3\u662f\u76f8\u5bf9\u8bc4\u4f30\uff1a<\/strong><\/p>\n\n\n\n<ol><li>\u5bf9\u4e8e\u6bcf\u4e2a\u8f93\u5165\u67e5\u8be2\uff0c\u6a21\u578b\u751f\u6210\u4e00\u7ec4\u6f5c\u5728\u56de\u7b54\u3002<\/li><li>\u6839\u636e\u6bcf\u4e2a\u56de\u7b54\u5728\u7ec4\u4e2d\u7684\u76f8\u5bf9\u8868\u73b0\u8fdb\u884c\u8bc4\u5206\uff0c\u800c\u4e0d\u662f\u5b64\u7acb\u5730\u8bc4\u4f30\u5355\u4e2a\u56de\u7b54\u3002<\/li><li>\u4e00\u4e2a\u56de\u7b54\u7684\u4f18\u52bf\u53cd\u6620\u4e86\u5176\u76f8\u5bf9\u4e8e\u7ec4\u5185\u5e73\u5747\u8868\u73b0\u7684\u4f18\u52a3\u7a0b\u5ea6\u3002<\/li><\/ol>\n\n\n\n<p>\u8fd9\u79cd\u65b9\u6cd5\u6d88\u9664\u4e86\u5bf9\u72ec\u7acb\u6279\u8bc4\u8005\u6a21\u578b\u7684\u9700\u6c42\uff0c\u4f7f<strong>GRPO\u65e2\u9ad8\u6548\u53c8\u7a33\u5065<\/strong>\u3002<strong>\u901a\u8fc7\u5728\u7ec4\u5185\u5f15\u5165\u7ade\u4e89\uff0cGRPO\u63a8\u52a8\u6a21\u578b\u4e0d\u65ad\u63d0\u5347\u5176\u63a8\u7406\u80fd\u529b<\/strong>\u3002\u6b63\u662f\u8fd9\u4e00\u521b\u65b0\u4f7fDeepSeek\u5728\u63a8\u7406\u4efb\u52a1\u4e2d\u53d6\u5f97\u4e86\u5353\u8d8a\u7684\u6210\u679c\u3002<\/p>\n\n\n\n<h3><strong>\u4ee5\u7b80\u5355\u7684\u65b9\u5f0f\u7406\u89e3GRPO\u76ee\u6807\u51fd\u6570<\/strong><\/h3>\n\n\n\n<p>GRPO\uff08Group Relative Policy 
Optimization\uff0c\u7fa4\u4f53\u76f8\u5bf9\u7b56\u7565\u4f18\u5316\uff09\u7684\u76ee\u6807\u51fd\u6570\u5c31\u50cf\u662f\u4e00\u79cd\u201c\u98df\u8c31\u201d\uff0c\u901a\u8fc7\u6bd4\u8f83\u6a21\u578b\u7684\u56de\u7b54\u5e76\u9010\u6b65\u6539\u8fdb\uff0c\u6559\u4f1a\u6a21\u578b\u751f\u6210\u66f4\u597d\u7684\u7b54\u6848\u3002\u8ba9\u6211\u4eec\u7528\u4e00\u4e2a\u6613\u4e8e\u7406\u89e3\u7684\u65b9\u5f0f\u9010\u6b65\u89e3\u6790\u5b83\uff1a<\/p>\n\n\n\n<p><strong>\u76ee\u6807<\/strong>\uff1a\u5047\u8bbe\u4f60\u6b63\u5728\u6559\u4e00\u7ec4\u5b66\u751f\u89e3\u51b3\u4e00\u4e2a\u6570\u5b66\u95ee\u9898\u3002\u4f60\u4e0d\u662f\u5355\u7eaf\u544a\u8bc9\u4ed6\u4eec\u8c01\u5bf9\u8c01\u9519\uff0c\u800c\u662f\u901a\u8fc7\u6bd4\u8f83\u6240\u6709\u5b66\u751f\u7684\u7b54\u6848\uff0c\u627e\u51fa\u8c01\u505a\u5f97\u6700\u597d\uff08\u4ee5\u53ca\u539f\u56e0\uff09\u3002\u7136\u540e\uff0c\u4f60\u901a\u8fc7\u5956\u52b1\u66f4\u597d\u7684\u65b9\u6cd5\u5e76\u6539\u8fdb\u8f83\u5f31\u7684\u65b9\u6cd5\u6765\u5e2e\u52a9\u4ed6\u4eec\u5b66\u4e60\u3002\u8fd9\u6b63\u662fGRPO\u6240\u505a\u7684\u2014\u2014\u53ea\u4e0d\u8fc7\u5b83\u6559\u7684\u662fAI\u6a21\u578b\uff0c\u800c\u4e0d\u662f\u5b66\u751f\u3002<\/p>\n\n\n\n<p><strong>\u9010\u6b65\u89e3\u6790<\/strong><\/p>\n\n\n\n<p><strong>\u7b2c\u4e00\u6b65\uff1a\u4ece\u67e5\u8be2\u5f00\u59cb<\/strong><\/p>\n\n\n\n<p>\u4ece\u8bad\u7ec3\u6570\u636e\u96c6 P(Q) \u4e2d\u9009\u53d6\u4e00\u4e2a\u67e5\u8be2 (q)\u3002<br>\u4f8b\u5982\uff1a\u5047\u8bbe\u67e5\u8be2\u662f\u201c8 + 5 \u7684\u548c\u662f\u591a\u5c11\uff1f\u201d<\/p>\n\n\n\n<p><strong>\u7b2c\u4e8c\u6b65\uff1a\u751f\u6210\u4e00\u7ec4\u56de\u7b54<\/strong><\/p>\n\n\n\n<p>\u6a21\u578b\u9488\u5bf9\u8be5\u67e5\u8be2\u751f\u6210\u4e00\u7ec4\u5171 G \u4e2a\u56de\u7b54\uff08\u672c\u4f8b\u4e2d G = 4\uff09\u3002<br>\u4f8b\u5982\uff1a\u6a21\u578b\u751f\u6210\u4ee5\u4e0b\u56de\u7b54\uff1a<br>o\u2081: \u201c\u7b54\u6848\u662f13\u3002\u201d<br>o\u2082: \u201c\u5341\u4e09\u3002\u201d<br>o\u2083: \u201c\u662f12\u3002\u201d<br>o\u2084: 
\u201c\u548c\u662f13\u3002\u201d<\/p>\n\n\n\n<p><strong>\u7b2c\u4e09\u6b65\uff1a\u4e3a\u6bcf\u4e2a\u56de\u7b54\u8ba1\u7b97\u5956\u52b1<\/strong><\/p>\n\n\n\n<p><strong>\u4ec0\u4e48\u662f\u5956\u52b1\uff1f<\/strong>\u5956\u52b1\u901a\u8fc7\u91cf\u5316\u6a21\u578b\u56de\u7b54\u7684\u8d28\u91cf\u6765\u6307\u5bfc\u5176\u5b66\u4e60\u3002<\/p>\n\n\n\n<p><strong>GRPO\u4e2d\u7684\u5956\u52b1\u7c7b\u578b\uff1a<\/strong><\/p>\n\n\n\n<ul><li><strong>\u51c6\u786e\u6027\u5956\u52b1<\/strong>\uff1a\u57fa\u4e8e\u56de\u7b54\u7684\u6b63\u786e\u6027\uff08\u4f8b\u5982\uff0c\u89e3\u51b3\u6570\u5b66\u95ee\u9898\uff09\u3002<\/li><li><strong>\u683c\u5f0f\u5956\u52b1<\/strong>\uff1a\u786e\u4fdd\u56de\u7b54\u9075\u5faa\u7ed3\u6784\u5316\u7684\u6307\u5bfc\uff08\u4f8b\u5982\uff0c\u7528 &lt;think&gt; \u6807\u7b7e\u5305\u88f9\u7684\u63a8\u7406\u8fc7\u7a0b\uff09\u3002<\/li><li><strong>\u8bed\u8a00\u4e00\u81f4\u6027\u5956\u52b1<\/strong>\uff1a\u60e9\u7f5a\u8bed\u8a00\u6df7\u6742\u6216\u683c\u5f0f\u4e0d\u8fde\u8d2f\u7684\u60c5\u51b5\u3002<\/li><\/ul>\n\n\n\n<p>\u57fa\u4e8e\u5176\u8868\u73b0\uff0c\u4e3a\u6bcf\u4e2a\u56de\u7b54\u5206\u914d\u4e00\u4e2a\u5956\u52b1 (r\u1d62)\u3002\u4f8b\u5982\uff0c\u5956\u52b1\u53ef\u80fd\u53d6\u51b3\u4e8e\uff1a<\/p>\n\n\n\n<ul><li><strong>\u51c6\u786e\u6027<\/strong>\uff1a\u7b54\u6848\u662f\u5426\u6b63\u786e\uff1f<\/li><li><strong>\u683c\u5f0f<\/strong>\uff1a\u56de\u7b54\u662f\u5426\u7ed3\u6784\u826f\u597d\uff1f<\/li><\/ul>\n\n\n\n<p><strong>\u793a\u4f8b\uff1a<\/strong><br>r\u2081 = 1.0\uff08\u6b63\u786e\u4e14\u683c\u5f0f\u826f\u597d\uff09\u3002<br>r\u2082 = 0.9\uff08\u6b63\u786e\u4f46\u4e0d\u591f\u6b63\u5f0f\uff09\u3002<br>r\u2083 = 0.0\uff08\u56de\u7b54\u9519\u8bef\uff09\u3002<br>r\u2084 = 1.0\uff08\u6b63\u786e\u4e14\u683c\u5f0f\u826f\u597d\uff09\u3002<\/p>\n\n\n\n<p><strong>\u7b2c\u56db\u6b65\uff1a\u6bd4\u8f83\u56de\u7b54\uff08\u7fa4\u4f53\u4f18\u52bf\uff09<\/strong><\/p>\n\n\n\n<ul><li>\u8ba1\u7b97\u6bcf\u4e2a\u56de\u7b54\u76f8\u5bf9\u4e8e\u7fa4\u4f53\u7684\u4f18\u52bf 
(A\u1d62)\uff1a<\/li><\/ul>\n\n\n\n<figure class=\"wp-block-image size-full is-resized\"><img loading=\"lazy\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/06\/image-39.png\" alt=\"\" class=\"wp-image-26940\" width=\"506\" height=\"539\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/06\/image-39.png 709w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/06\/image-39-281x300.png 281w\" sizes=\"(max-width: 506px) 100vw, 506px\" \/><\/figure>\n\n\n\n<ul><li>\u8868\u73b0\u4f18\u4e8e\u7fa4\u4f53\u5e73\u5747\u6c34\u5e73\u7684\u56de\u7b54\u4f1a\u83b7\u5f97\u6b63\u5206\uff0c\u800c\u8868\u73b0\u8f83\u5dee\u7684\u56de\u7b54\u5219\u4f1a\u83b7\u5f97\u8d1f\u5206\u3002<\/li><li>\u8fd9\u79cd\u65b9\u6cd5\u5728\u7ec4\u5185\u5f15\u5165\u4e86\u7ade\u4e89\u673a\u5236\uff0c\u63a8\u52a8\u6a21\u578b\u751f\u6210\u66f4\u597d\u7684\u56de\u7b54\u3002<\/li><\/ul>\n\n\n\n<p><strong>\u7b2c\u4e94\u6b65\uff1a\u4f7f\u7528\u622a\u65ad\u6280\u672f\u66f4\u65b0\u7b56\u7565<\/strong><\/p>\n\n\n\n<figure class=\"wp-block-image size-full is-resized\"><img loading=\"lazy\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/06\/image-40.png\" alt=\"\" class=\"wp-image-26944\" width=\"538\" height=\"156\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/06\/image-40.png 793w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/06\/image-40-300x87.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/06\/image-40-768x222.png 768w\" sizes=\"(max-width: 538px) 100vw, 538px\" \/><\/figure>\n\n\n\n<ul><li>\u793a\u4f8b\uff1a\u5982\u679c\u65b0\u7b56\u7565\u5f00\u59cb\u4e3a o\u2081 
\u5206\u914d\u8fc7\u591a\u6982\u7387\uff0c\u622a\u65ad\u6280\u672f\u4f1a\u786e\u4fdd\u5b83\u4e0d\u4f1a\u8fc7\u5ea6\u5f3a\u8c03\u8fd9\u4e2a\u56de\u7b54\u3002<\/li><li>\u8fd9\u79cd\u65b9\u6cd5\u5373\u4f7f\u5728\u590d\u6742\u4efb\u52a1\uff08\u5982\u63a8\u7406\uff09\u4e2d\uff0c\u4e5f\u80fd\u5b9e\u73b0\u7a33\u5b9a\u53ef\u9760\u7684\u7b56\u7565\u4f18\u5316\u3002<\/li><\/ul>\n\n\n\n<p><strong>\u7b2c\u516d\u6b65\uff1a\u4f7f\u7528KL\u6563\u5ea6\u60e9\u7f5a\u504f\u79bb<\/strong><\/p>\n\n\n\n<figure class=\"wp-block-image size-full is-resized\"><img loading=\"lazy\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/06\/image-41.png\" alt=\"\" class=\"wp-image-26949\" width=\"592\" height=\"91\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/06\/image-41.png 787w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/06\/image-41-300x46.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/06\/image-41-768x118.png 768w\" sizes=\"(max-width: 592px) 100vw, 592px\" \/><\/figure>\n\n\n\n<p><strong>\u6574\u4f53\u6d41\u7a0b<\/strong><\/p>\n\n\n\n<p>GRPO 
\u76ee\u6807\u51fd\u6570\u7684\u6267\u884c\u8fc7\u7a0b\u5982\u4e0b\uff1a<\/p>\n\n\n\n<ol><li>\u4e3a\u67e5\u8be2\u751f\u6210\u4e00\u7ec4\u56de\u7b54\u3002<\/li><li>\u6839\u636e\u9884\u5b9a\u4e49\u6807\u51c6\uff08\u4f8b\u5982\u51c6\u786e\u6027\u3001\u683c\u5f0f\uff09\u4e3a\u6bcf\u4e2a\u56de\u7b54\u8ba1\u7b97\u5956\u52b1\u3002<\/li><li>\u5728\u7ec4\u5185\u6bd4\u8f83\u56de\u7b54\uff0c\u8ba1\u7b97\u5b83\u4eec\u7684\u76f8\u5bf9\u4f18\u52bf\uff08A_i\uff09\u3002<\/li><li>\u66f4\u65b0\u7b56\u7565\uff0c\u4ee5\u503e\u5411\u4e8e\u5177\u6709\u66f4\u9ad8\u4f18\u52bf\u7684\u56de\u7b54\uff0c\u5e76\u901a\u8fc7\u622a\u65ad\u6280\u672f\u786e\u4fdd\u7a33\u5b9a\u6027\u3002<\/li><li>\u5bf9\u66f4\u65b0\u8fdb\u884c\u6b63\u5219\u5316\uff0c\u9632\u6b62\u6a21\u578b\u504f\u79bb\u5176\u57fa\u7ebf\u592a\u8fdc\u3002<\/li><\/ol>\n\n\n\n<p><strong>\u4e3a\u4ec0\u4e48GRPO\u6709\u6548\uff1f<\/strong><\/p>\n\n\n\n<ol><li><strong>\u65e0\u9700\u8bc4\u5224\u5668<\/strong>\uff1aGRPO \u901a\u8fc7\u4f9d\u8d56\u7ec4\u5185\u6bd4\u8f83\uff0c\u907f\u514d\u4e86\u5355\u72ec\u8bc4\u4f30\u5668\u7684\u9700\u6c42\uff0c\u964d\u4f4e\u4e86\u8ba1\u7b97\u6210\u672c\u3002<\/li><li><strong>\u7a33\u5b9a\u5b66\u4e60<\/strong>\uff1a\u622a\u65ad\u6280\u672f\u548cKL\u6b63\u5219\u5316\u786e\u4fdd\u6a21\u578b\u7a33\u6b65\u6539\u8fdb\uff0c\u4e0d\u4f1a\u51fa\u73b0\u5267\u70c8\u6ce2\u52a8\u3002<\/li><li><strong>\u9ad8\u6548\u8bad\u7ec3<\/strong>\uff1a\u901a\u8fc7\u5173\u6ce8\u76f8\u5bf9\u8868\u73b0\uff0cGRPO \u7279\u522b\u9002\u5408\u50cf\u63a8\u7406\u8fd9\u6837\u7684\u4efb\u52a1\uff0c\u56e0\u4e3a\u8fd9\u4e9b\u4efb\u52a1\u5f88\u96be\u7528\u7edd\u5bf9\u8bc4\u5206\u8861\u91cf\u3002<\/li><\/ol>\n\n\n\n<h2>RLOO\uff08REINFORCE Leave-One-Out\uff09<\/h2>\n\n\n\n<p><a href=\"https:\/\/arxiv.org\/abs\/2402.14740\">Back to Basics: Revisiting REINFORCE Style Optimization for Learning from Human Feedback in 
LLMs<\/a><\/p>\n\n\n\n<p>\u5728RLHF\u8fc7\u7a0b\u4e2d\uff0cPPO\u662f\u6700\u5e38\u7528\u7684\u5bf9\u9f50\u7b97\u6cd5\u3002PPO\u662f\u4e00\u79cd\u5305\u542b\u4e86\u5f88\u591a\u6280\u5de7\u7684\u5f3a\u5316\u5b66\u4e60\u7b97\u6cd5\uff0c\u4f8b\u5982GAE\uff0cimportance weight\uff0cpolicy\/value\u7684clip\u7b49\u7b49\u3002\u672c\u6587\u63d0\u51fa\u7684\u89c2\u70b9\u662f\uff0c\u76f4\u63a5\u91c7\u7528\u66f4\u7b80\u5355\u7684policy gradient\u7c7b\u5f3a\u5316\u5b66\u4e60\u7b97\u6cd5\u4e5f\u53ef\u4ee5\u53d6\u5f97\u5f88\u597d\u7684\u6548\u679c\uff0cPPO\u4e2d\u7684value\u6a21\u578b\uff0cclip\u64cd\u4f5c\u7b49\u6a21\u5757\u53ef\u80fd\u5e76\u4e0d\u6709\u6548\u3002\u672c\u6587\u63d0\u51fa\u7684RLOO\uff08REINFORCE Leave-One-Out\uff09\u7b97\u6cd5\u5728\u591a\u79cd\u5927\u6a21\u578b\u4efb\u52a1\u4e2d\u90fd\u53d6\u5f97\u4e86\u4f18\u4e8ePPO\/DPO\u7684\u7ed3\u679c\uff0c\u540c\u65f6\u4e5f\u5bf9\u566a\u58f0\u548cKL\u7ea6\u675f\u66f4robust\u3002<\/p>\n\n\n\n<p><strong>\u6838\u5fc3\u7406\u5ff5<\/strong>\uff1a\u5728\u7ecf\u5178 REINFORCE \u4e2d\u5f15\u5165 per-prompt baseline\uff0c\u5373\u201c\u7559\u4e00\u6cd5\u201d\uff08Leave-One-Out\uff09\u6765\u4f30\u8ba1 baseline\uff0c\u663e\u8457\u964d\u4f4e\u68af\u5ea6\u65b9\u5dee\uff0c\u5728\u7ebf RLHF\uff0c\u65e0\u9700 critic \u7f51\u7edc\u3002<\/p>\n\n\n\n<p>PPO\u7b97\u6cd5\u662f\u7531policy 
gradient\/actor-critic\u7b49\u5f3a\u5316\u5b66\u4e60\u7b97\u6cd5\u53d1\u5c55\u800c\u6765\u7684\u7b97\u6cd5\uff0c\u5df2\u7ecf\u5728\u4f17\u591a\u7684\u5f3a\u5316\u5b66\u4e60\u7ecf\u5178\u4efb\u52a1\u4e2d\u9a8c\u8bc1\u4e86\u6548\u679c\u3002\u7136\u800c\u628aPPO\u7528\u5728LLM\u4e2d\u4f1a\u6709\u8ba1\u7b97\u6210\u672c\u9ad8\uff08\u9700\u8981\u52a0\u8f7dpolicy\/ref-policy\/value\/reward\/4\u4e2a\u6a21\u578b\uff09\uff0c\u4f17\u591a\u6a21\u5757\u8026\u5408\u5728\u4e00\u8d77\u96be\u4ee5\u5224\u65ad\u9519\u8bef\u51fa\u73b0\u7684\u6e90\u5934\uff0c\u4ee5\u53ca\u7b97\u6cd5\u8868\u73b0\u4e0d\u7a33\u5b9a\u7b49\u95ee\u9898\u3002<\/p>\n\n\n\n<p>\u4e3a\u4e86\u89e3\u51b3\u4e0a\u8ff0\u95ee\u9898\uff0c\u672c\u6587\u4f7f\u7528\u66f4\u539f\u59cb\uff0c\u66f4\u7b80\u5355\u7684RL\u7b97\u6cd5\u53bb\u66ff\u4ee3PPO\u3002RL\u4e2dpolicy-based\u6700\u57fa\u7840\u7684reinforce\u7b97\u6cd5\u5982\u4e0b\u6240\u793a\uff1a<\/p>\n\n\n\n<figure class=\"wp-block-image size-full is-resized\"><img loading=\"lazy\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-54.png\" alt=\"\" class=\"wp-image-27838\" width=\"407\" height=\"47\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-54.png 652w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-54-300x35.png 300w\" sizes=\"(max-width: 407px) 100vw, 407px\" \/><\/figure>\n\n\n\n<p>b\u8868\u793abaseline\uff0c\u7528\u6765\u964d\u4f4e\u65b9\u5dee\u3002RLOO\u4f7f\u7528\u4e00\u79cd\u8499\u7279\u5361\u6d1b\u7684\u65b9\u5f0f\u53bb\u8ba1\u7b97b\uff1a<\/p>\n\n\n\n<figure class=\"wp-block-image size-full is-resized\"><img loading=\"lazy\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-55.png\" alt=\"\" class=\"wp-image-27839\" width=\"612\" height=\"67\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-55.png 898w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-55-300x33.png 300w\" sizes=\"(max-width: 612px) 100vw, 612px\" 
\/><\/figure>\n\n\n\n<p>\u8fd9\u6837\u7684\u65b9\u5f0f\u80fd\u591f\u907f\u514d\u4f7f\u7528value model\u548cGAE\uff0c\u51cf\u5c11\u663e\u5b58\u5360\u7528\u3002PPO\u4f7f\u7528GAE\u7684\u65b9\u5f0f\u6765\u5e73\u8861\u504f\u5dee\u548c\u65b9\u5dee\u3002\u4e0ePPO\u76f8\u6bd4\uff0c<strong>reinforce\u7b97\u6cd5\u7684\u65b9\u5dee\u66f4\u5927\uff0c\u4f46\u662f\u7531\u4e8e\u9884\u8bad\u7ec3\u51fa\u7684\u6a21\u578b\u8db3\u591f\u5f3a\u5927\uff0c\u65b9\u5dee\u4e0d\u662f\u4e3b\u8981\u95ee\u9898\uff0c\u7528RLOO\u7684\u5f62\u5f0f\u53bb\u8fdb\u884c\u68af\u5ea6\u66f4\u65b0\u662f\u53ef\u4ee5\u63a5\u53d7\u7684\u3002<\/strong><\/p>\n\n\n\n<h2>REINFORCE++: \u6bd4 GRPO \u7a33\u5b9a\u6bd4PPO\u5feb<\/h2>\n\n\n\n<p><a rel=\"noreferrer noopener\" href=\"https:\/\/arxiv.org\/abs\/2501.03262\" target=\"_blank\">An Efficient RLHF Algorithm with Robustness to Both Prompt and Reward Models<\/a><\/p>\n\n\n\n<p><a href=\"https:\/\/arxiv.org\/html\/2501.03262v1\">REINFORCE++: A Simple and Efficient Approach for Aligning Large Language Models<\/a><\/p>\n\n\n\n<p>REINFORCE++-baseline \u5728 reasoning \u7b49\u4efb\u52a1\u4e2d\u975e\u5e38\u597d\u7528\uff0c<strong>\u5168\u5c40\u7684\u6807\u51c6\u5dee\u5f52\u4e00\u5316\u907f\u514d\u4e86 std \u592a\u5c0f\u5e26\u6765\u7684 advantage\u4e0d\u7a33\u5b9a<\/strong>\u3002<\/p>\n\n\n\n<p><strong>REINFORCE++\u7684\u6838\u5fc3\u601d\u60f3\u662f\u5c06PPO\u4e2d\u7684\u5404\u79cd\u4f18\u5316\u6280\u5de7\u6574\u5408\u5230\u7ecf\u5178\u7684\u5f3a\u5316\u5b66\u4e60\u7b97\u6cd5REINFORCE\u4e2d\uff0c\u4ee5\u63d0\u5347\u5176\u6027\u80fd\u548c\u7a33\u5b9a\u6027\u3002<\/strong>\u8fd9\u6837REINFORCE++\u4e0d\u9700\u8981 Critic \u4ece\u800c\u8282\u7701\u8ba1\u7b97\u8d44\u6e90\uff0c\u53c8\u52a0\u6301\u4e86 PPO \u76f8\u5173\u7684\u4f18\u5316\u6280\u5de7\u5b9e\u73b0\u9ad8\u6548\u8bad\u7ec3\u3002\u00a0<strong>REINFORCE++\u7684\u7279\u70b9\u662f \u6bd4 GRPO 
\u7a33\u5b9a\u6bd4PPO\u5feb\u3002<\/strong><\/p>\n\n\n\n<p>REINFORCE\u7b97\u6cd5\u57fa\u4e8e\u8499\u7279\u5361\u7f57\u65b9\u6cd5\uff0c\u901a\u8fc7\u4ee5\u4e0b\u6b65\u9aa4\u8fdb\u884c\u64cd\u4f5c\uff1a<\/p>\n\n\n\n<p>&#8211; <strong>\u7b56\u7565\u91c7\u6837<\/strong>\uff1a\u667a\u80fd\u4f53\u6839\u636e\u5f53\u524d\u7b56\u7565\u4e0e\u73af\u5883\u4ea4\u4e92\uff0c\u751f\u6210\u4e00\u6761\u72b6\u6001-\u52a8\u4f5c-\u5956\u52b1\u5e8f\u5217\uff08\u8f68\u8ff9\uff09\u3002<\/p>\n\n\n\n<p>&#8211; <strong>\u56de\u62a5\u8ba1\u7b97<\/strong>\uff1a\u5bf9\u6bcf\u6761\u8f68\u8ff9\u8fdb\u884c\u56de\u62a5\u8ba1\u7b97\uff0c\u901a\u5e38\u91c7\u7528\u6298\u6263\u7d2f\u8ba1\u5956\u52b1\u7684\u5f62\u5f0f\uff0c\u5373\uff1a<\/p>\n\n\n\n<div class=\"wp-block-image\"><figure class=\"aligncenter size-full is-resized\"><img loading=\"lazy\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-49.png\" alt=\"\" class=\"wp-image-27815\" width=\"146\" height=\"34\"\/><\/figure><\/div>\n\n\n\n<p>\u5176\u4e2d\uff0c\u00a0<em>\u03b3\u00a0<\/em>\u662f\u6298\u6263\u56e0\u5b50\uff0c\u00a0<em>r<sub>k<\/sub><\/em>\u00a0\u662f\u5728\u65f6\u95f4\u6b65<em>\u00a0k<\/em>\u00a0\u83b7\u5f97\u7684\u5373\u65f6\u5956\u52b1\u3002<\/p>\n\n\n\n<p>\u00a0<strong>\u68af\u5ea6\u4f30\u8ba1<\/strong>\uff1a\u4f7f\u7528\u8499\u7279\u5361\u7f57\u65b9\u6cd5\u8ba1\u7b97\u7b56\u7565\u68af\u5ea6\uff0c\u66f4\u65b0\u7b56\u7565\u53c2\u6570\u00a0\u03b8\u00a0\u7684\u516c\u5f0f\u4e3a\uff1a<\/p>\n\n\n\n<div class=\"wp-block-image\"><figure class=\"aligncenter size-full is-resized\"><img loading=\"lazy\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-50.png\" alt=\"\" class=\"wp-image-27818\" width=\"253\" height=\"34\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-50.png 490w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-50-300x40.png 300w\" sizes=\"(max-width: 253px) 100vw, 253px\" \/><\/figure><\/div>\n\n\n\n<p>&#8211; 
<strong>\u7b56\u7565\u66f4\u65b0<\/strong>\uff1a\u901a\u8fc7\u68af\u5ea6\u4e0a\u5347\u6cd5\u66f4\u65b0\u7b56\u7565\u53c2\u6570\uff1a<\/p>\n\n\n\n<div class=\"wp-block-image\"><figure class=\"aligncenter size-full is-resized\"><img loading=\"lazy\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-51.png\" alt=\"\" class=\"wp-image-27821\" width=\"190\" height=\"22\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-51.png 313w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-51-300x35.png 300w\" sizes=\"(max-width: 190px) 100vw, 190px\" \/><\/figure><\/div>\n\n\n\n<p>\u5176\u4e2d\uff0c&nbsp;\u03b1&nbsp;\u662f\u5b66\u4e60\u7387\u3002<\/p>\n\n\n\n<figure class=\"wp-block-image size-large\"><img loading=\"lazy\" width=\"1024\" height=\"523\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-45-1024x523.png\" alt=\"\" class=\"wp-image-27803\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-45-1024x523.png 1024w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-45-300x153.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-45-768x392.png 768w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-45.png 1333w\" sizes=\"(max-width: 1024px) 100vw, 1024px\" \/><\/figure>\n\n\n\n<h3>RLHF Implementation Tricks<\/h3>\n\n\n\n<p>\u5728 REINFORCE \u4e0a\u96c6\u6210\u4e0b\u9762\u7684\u4f18\u5316 Tricks \u4ee5\u7a33\u5b9a\u6a21\u578b\u7684\u8bad\u7ec3\u3002<\/p>\n\n\n\n<p><strong> Token Level KL-Penalty  <\/strong><\/p>\n\n\n\n<p>Token Level KL-Penalty 
\u662f\u4e00\u79cd\u5728\u5e8f\u5217\u751f\u6210\u4efb\u52a1\u4e2d\u4f7f\u7528\u7684\u6b63\u5219\u5316\u6280\u672f\u3002\u5176\u4e3b\u8981\u76ee\u7684\u662f\u63a7\u5236\u751f\u6210\u7684\u6587\u672c\u4e0e\u8bad\u7ec3\u6570\u636e\u4e4b\u95f4\u7684\u5dee\u5f02\uff0c\u4ee5\u907f\u514d\u6a21\u578b\u751f\u6210\u8fc7\u4e8e\u504f\u79bb\u8bad\u7ec3\u5206\u5e03\u7684\u8f93\u51fa\u3002\u5177\u4f53\u65b9\u6cd5\u5982\u4e0b\uff1a<\/p>\n\n\n\n<div class=\"wp-block-image\"><figure class=\"aligncenter size-full is-resized\"><img loading=\"lazy\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-52.png\" alt=\"\" class=\"wp-image-27827\" width=\"276\" height=\"113\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-52.png 463w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-52-300x123.png 300w\" sizes=\"(max-width: 276px) 100vw, 276px\" \/><\/figure><\/div>\n\n\n\n<p>\u8fd9\u79cd Token-level KL \u7684\u597d\u5904\u662f\u53ef\u4ee5\u65e0\u7f1d\u517c\u5bb9 PRM \u5e76\u4e14\u5b9e\u73b0\u4e86KL reward\u7684\u4fe1\u7528\u5206\u914d (\u66f4\u65b0\uff1a\u6700\u8fd1\u4e5f\u6709\u7f51\u53cb\u63d0\u5230\u7528GRPO\u7684\u5916\u7f6ekl \u4e5f\u53ef\u4ee5)<\/p>\n\n\n\n<p><strong>Mini-batch Updates<\/strong><\/p>\n\n\n\n<p>Mini-batch Updates \u662f\u4e00\u79cd\u5e38\u7528\u7684\u4f18\u5316\u7b56\u7565\uff0c\u65e8\u5728\u63d0\u9ad8\u8bad\u7ec3\u6548\u7387\u548c\u7a33\u5b9a\u6027\u3002\u5176\u57fa\u672c\u601d\u60f3\u662f\uff1a<\/p>\n\n\n\n<ul><li>&#8211; \u5c0f\u6279\u91cf\u6837\u672c\uff1a\u5c06\u8bad\u7ec3\u6570\u636e\u5212\u5206\u4e3a\u591a\u4e2a\u5c0f\u6279\u91cf\uff08mini-batch\uff09\uff0c\u800c\u4e0d\u662f\u4f7f\u7528\u6574\u4e2a\u6570\u636e\u96c6\u8fdb\u884c\u66f4\u65b0\u3002<\/li><li>&#8211; \u9891\u7e41\u66f4\u65b0\uff1a\u901a\u8fc7\u5728\u6bcf\u4e2a\u5c0f\u6279\u91cf\u4e0a\u8fdb\u884c\u591a\u6b21\u53c2\u6570\u66f4\u65b0\uff0c\u53ef\u4ee5\u66f4\u5feb\u5730\u6536\u655b\uff0c\u540c\u65f6\u51cf\u5c11\u5185\u5b58\u6d88\u8017\u3002<\/li><li>&#8211; 
\u968f\u673a\u6027\u5f15\u5165\uff1a\u5c0f\u6279\u91cf\u66f4\u65b0\u5f15\u5165\u4e86\u968f\u673a\u6027\uff0c\u6709\u52a9\u4e8e\u907f\u514d\u5c40\u90e8\u6700\u4f18\u89e3\uff0c\u63d0\u9ad8\u6a21\u578b\u7684\u6cdb\u5316\u80fd\u529b\u3002<\/li><\/ul>\n\n\n\n<p><strong>Reward Normalization and Clipping<\/strong><\/p>\n\n\n\n<p>Reward Normalization and Clipping \u662f\u5904\u7406\u5956\u52b1\u4fe1\u53f7\u4e0d\u7a33\u5b9a\u7684\u4e00\u79cd\u65b9\u6cd5\u3002\u5177\u4f53\u5305\u62ec\uff1a<\/p>\n\n\n\n<p>-\u5956\u52b1\u5f52\u4e00\u5316\uff1a\u901a\u8fc7\u5bf9\u5956\u52b1\u8fdb\u884c\u6807\u51c6\u5316\uff08\u4f8b\u5982\uff0c\u51cf\u53bb\u5747\u503c\u5e76\u9664\u4ee5\u6807\u51c6\u5dee\uff09\uff0c\u4f7f\u5f97\u5956\u52b1\u4fe1\u53f7\u66f4\u4e3a\u5e73\u7a33\uff0c\u4ece\u800c\u63d0\u9ad8\u8bad\u7ec3\u8fc7\u7a0b\u7684\u7a33\u5b9a\u6027\u3002<\/p>\n\n\n\n<p>&#8211; \u5956\u52b1\u88c1\u526a\uff1a\u9650\u5236\u5956\u52b1\u503c\u5728\u67d0\u4e2a\u8303\u56f4\u5185\uff0c\u4ee5\u9632\u6b62\u6781\u7aef\u5956\u52b1\u5bf9\u6a21\u578b\u66f4\u65b0\u9020\u6210\u8fc7\u5927\u7684\u5f71\u54cd\u3002\u8fd9\u6709\u52a9\u4e8e\u4fdd\u6301\u5b66\u4e60\u8fc7\u7a0b\u7684\u7a33\u5b9a\u6027\uff0c\u5e76\u9632\u6b62\u68af\u5ea6\u7206\u70b8\u3002<\/p>\n\n\n\n<p><strong> Advantage Normalization  <\/strong><\/p>\n\n\n\n<p>Advantage Normalization \u662f\u4e00\u79cd\u7528\u4e8e\u5904\u7406\u4f18\u52bf\u51fd\u6570\uff08advantage function\uff09\u4f30\u8ba1\u65b9\u5dee\u7684\u65b9\u6cd5\u3002<strong>REINFORCE++\u7684\u4f18\u52bf\u51fd\u6570\u5b9a\u4e49\u4e3a<\/strong>\uff1a<\/p>\n\n\n\n<figure class=\"wp-block-image size-large\"><img loading=\"lazy\" width=\"1024\" height=\"238\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-47-1024x238.png\" alt=\"\" class=\"wp-image-27810\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-47-1024x238.png 1024w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-47-300x70.png 300w, 
http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-47-768x178.png 768w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-47.png 1233w\" sizes=\"(max-width: 1024px) 100vw, 1024px\" \/><\/figure>\n\n\n\n<p>\u5176\u4e2d&nbsp;r&nbsp;\u662fOutcome\u5956\u52b1\u51fd\u6570\uff0c&nbsp;KL&nbsp;\u662fper-token \u7684KL reward\uff0c&nbsp;t&nbsp;\u662ftoken\u4f4d\u7f6e\u3002<\/p>\n\n\n\n<p><strong>\u4f18\u52bf\u5f52\u4e00\u5316\u7684\u6b65\u9aa4\u5305\u62ec\uff1a<\/strong><\/p>\n\n\n\n<p>&#8211; <strong>\u5747\u503c\u548c\u65b9\u5dee\u8ba1\u7b97<\/strong>\uff1a\u5bf9\u4e00\u4e2abatch\u8ba1\u7b97\u51fa\u7684\u4f18\u52bf\u503c\u8fdb\u884c\u5747\u503c\u548c\u65b9\u5dee\u8ba1\u7b97\u3002<\/p>\n\n\n\n<p>&#8211; <strong>\u5f52\u4e00\u5316\u5904\u7406<\/strong>\uff1a\u5c06\u4f18\u52bf\u503c\u51cf\u53bb\u5747\u503c\u5e76\u9664\u4ee5\u6807\u51c6\u5dee\uff0c\u4f7f\u5f97\u4f18\u52bf\u503c\u5177\u6709\u66f4\u597d\u7684\u6570\u503c\u7a33\u5b9a\u6027\uff0c\u8fdb\u800c\u63d0\u9ad8\u5b66\u4e60\u6548\u679c\u3002<\/p>\n\n\n\n<figure class=\"wp-block-image size-large\"><img loading=\"lazy\" width=\"1024\" height=\"429\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-48-1024x429.png\" alt=\"\" class=\"wp-image-27811\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-48-1024x429.png 1024w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-48-300x126.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-48-768x322.png 768w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-48.png 1219w\" sizes=\"(max-width: 1024px) 100vw, 1024px\" \/><\/figure>\n\n\n\n<p><strong>PPO-Clip<\/strong><\/p>\n\n\n\n<p>PPO-Clip \u662f\u8fd1\u7aef\u7b56\u7565\u4f18\u5316\uff08Proximal Policy Optimization, 
PPO\uff09\u7b97\u6cd5\u4e2d\u7684\u4e00\u4e2a\u5173\u952e\u6280\u5de7\uff0c\u7528\u4e8e\u9650\u5236\u7b56\u7565\u66f4\u65b0\u5e45\u5ea6\u3002\u5176\u4e3b\u8981\u601d\u60f3\u662f\uff1a<\/p>\n\n\n\n<p><strong>\u526a\u5207\u76ee\u6807\u51fd\u6570<\/strong>\uff1a\u901a\u8fc7\u5f15\u5165\u4e00\u4e2a\u526a\u5207\u673a\u5236\uff0c\u9650\u5236\u65b0\u65e7\u7b56\u7565\u4e4b\u95f4\u7684\u6bd4\u7387\u53d8\u5316\uff0c\u786e\u4fdd\u66f4\u65b0\u4e0d\u4f1a\u8fc7\u5927\u3002\u8fd9\u53ef\u4ee5\u7528\u4ee5\u4e0b\u516c\u5f0f\u8868\u793a\uff1a<\/p>\n\n\n\n<div class=\"wp-block-image\"><figure class=\"aligncenter size-full is-resized\"><img loading=\"lazy\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-53.png\" alt=\"\" class=\"wp-image-27829\" width=\"433\" height=\"77\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-53.png 838w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-53-300x54.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-53-768x138.png 768w\" sizes=\"(max-width: 433px) 100vw, 433px\" \/><\/figure><\/div>\n\n\n\n<p><strong>\u63d0\u9ad8\u7a33\u5b9a\u6027\u548c\u6837\u672c\u6548\u7387<\/strong>\uff1a\u8fd9\u79cd\u526a\u5207\u673a\u5236\u6709\u6548\u9632\u6b62\u4e86\u7b56\u7565\u66f4\u65b0\u8fc7\u5927\u5bfc\u81f4\u7684\u4e0d\u7a33\u5b9a\uff0c\u63d0\u9ad8\u4e86\u7b97\u6cd5\u7684\u6536\u655b\u901f\u5ea6\u548c\u6837\u672c\u6548\u7387\u3002<\/p>\n\n\n\n<figure class=\"wp-block-image size-large\"><img loading=\"lazy\" width=\"1024\" height=\"476\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-46-1024x476.png\" alt=\"\" class=\"wp-image-27808\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-46-1024x476.png 1024w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-46-300x140.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-46-768x357.png 768w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-46.png 1264w\" sizes=\"(max-width: 1024px) 
100vw, 1024px\" \/><\/figure>\n\n\n\n<h2>ORPO\u504f\u597d\u4f18\u5316\uff08Odds Ratio Preference Optimization\uff09<\/h2>\n\n\n\n<p><strong>ORPO: <a href=\"https:\/\/arxiv.org\/abs\/2403.07691\" target=\"_blank\" rel=\"noreferrer noopener\">Monolithic Preference Optimization without Reference Model<\/a><\/strong><\/p>\n\n\n\n<p class=\"has-light-pink-background-color has-background\"><strong>\u6838\u5fc3\uff1a<\/strong> <strong>\u6700\u5927\u5316\u6b63\u6837\u672c\u7684\u751f\u6210\u6982\u7387\uff0c\u6700\u5c0f\u5316\u8d1f\u6837\u672c\u7684\u751f\u6210\u6982\u7387<\/strong> \u3002\u76f8\u6bd4DPO \u3010\u52a0\u8f7d2\u4e2a\u6a21\u578b\uff0c\u5176\u4e2d\u4e00\u4e2a\u63a8\u7406\uff0c\u53e6\u5916\u4e00\u4e2a\u8bad\u7ec3\uff0c\u76f4\u63a5\u5728\u504f\u597d\u6570\u636e\u4e0a\u8fdb\u884c\u8bad\u7ec3\u3011<strong>\uff0c\u53ea\u52a0\u8f7d\u8bad\u7ec3\u6a21\u578b\uff0c\u76f4\u63a5\u5728\u504f\u597d\u6570\u636e\u4e0a\u8fdb\u884c\u8bad\u7ec3<\/strong>\u3002<\/p>\n\n\n\n<p><strong>\u672c\u6587\u63d0\u51fa\u7684\u7b97\u6cd5ORPO\u662f\u5bf9SFT\u7684\u6539\u8fdb\uff0c\u901a\u8fc7\u4fee\u6539SFT\u9636\u6bb5\u7684\u635f\u5931\u51fd\u6570\uff0c\u5c06\u7c7b\u4f3c\u4e8eDPO\u4e2d\u504f\u597d\u5bf9\u9f50\u7684\u601d\u60f3\u5f15\u5165\u5230SFT\u4e2d\uff0c\u63d0\u51fa\u4e00\u79cd\u65e0\u9700\u5956\u52b1\u6a21\u578b\u548c\u53c2\u8003\u6a21\u578b\u7b97\u6cd5\u3002\u540c\u65f6\uff0cORPO\u53ea\u6709\u4e00\u9636\u6bb5\uff0c\u4e0d\u9700\u8981\u50cfDPO\u4e00\u6837\u9700\u8981\u5148SFT\u518dDPO\u5bf9\u9f50\u3002\u5728\u4f17\u591a\u5927\u6a21\u578b\u4efb\u52a1\u4e0a\u7684\u5b9e\u9a8c\u7ed3\u679c\u8868\u660e\uff0c\u4e0eSFT\uff0cDPO\u7b49\u7b97\u6cd5\u76f8\u6bd4\uff0cORPO\u66f4\u6709\u4f18\u52bf\u3002<\/strong><\/p>\n\n\n\n<p>\u672c\u6587\u63d0\u51fa\u7684\u7b97\u6cd5ORPO\u662f\u5bf9SFT\u7684\u6539\u8fdb\uff0c\u4fee\u6539\u4e86SFT\u9636\u6bb5\u7684\u635f\u5931\u51fd\u6570\u3002\u540c\u65f6\uff0c\u4e0eDPO\/PPO\u76f8\u6bd4\uff0cORPO\u5c06\u539f\u672c\u5206\u4e24\u6b65\u8fdb\u884c\u7684\u8fc7\u7a0b\uff08S
FT+DPO\/PPO\uff09\u5408\u5e76\u4e3a\u4e00\u6b65\uff0c\u66f4\u52a0\u7b80\u6d01\u9ad8\u6548\u3002<\/p>\n\n\n\n<figure class=\"wp-block-image size-large\"><img loading=\"lazy\" width=\"1024\" height=\"305\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-42-1024x305.png\" alt=\"\" class=\"wp-image-24738\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-42-1024x305.png 1024w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-42-300x89.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-42-768x229.png 768w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-42.png 1150w\" sizes=\"(max-width: 1024px) 100vw, 1024px\" \/><\/figure>\n\n\n\n<p>\u73b0\u5728\u6709\u8bb8\u591a\u65b9\u6cd5\u53ef\u4ee5\u4f7f\u5927\u578b\u8bed\u8a00\u6a21\u578b\uff08LLM\uff09\u4e0e\u4eba\u7c7b\u504f\u597d\u4fdd\u6301\u4e00\u81f4\u3002\u4ee5\u4eba\u7c7b\u53cd\u9988\u4e3a\u57fa\u7840\u7684\u5f3a\u5316\u5b66\u4e60\uff08RLHF\uff09\u662f\u6700\u65e9\u7684\u65b9\u6cd5\u4e4b\u4e00\uff0c\u5e76\u4fc3\u6210\u4e86ChatGPT\u7684\u8bde\u751f\uff0c\u4f46RLHF\u7684\u6210\u672c\u975e\u5e38\u9ad8\u3002\u4e0eRLHF\u76f8\u6bd4\uff0cDPO\u3001IPO\u548cKTO\u7684\u6210\u672c\u660e\u663e\u66f4\u4f4e\uff0c\u56e0\u4e3a\u5b83\u4eec\u4e0d\u9700\u8981\u5956\u52b1\u6a21\u578b\u3002<\/p>\n\n\n\n<p>\u867d\u7136DPO\u548cIPO\u7684\u6210\u672c\u8f83\u4f4e\uff0c\u4f46\u5b83\u4eec\u4ecd\u9700\u8bad\u7ec3\u4e24\u4e2a\u4e0d\u540c\u7684\u6a21\u578b\u3002<strong>\u9996\u5148\u662f\u76d1\u7763\u5fae\u8c03\uff08SFT\uff09\u6b65\u9aa4\uff0c\u5373\u8bad\u7ec3\u6a21\u578b\u6309\u6307\u4ee4\u56de\u7b54\u95ee\u9898\uff0c\u7136\u540e\u4f7f\u7528SFT\u6a21\u578b\u4f5c\u4e3a\u521d\u59cb\u5316\u548c\u53c2\u8003\uff0c\u4ee5\u4f7f\u6a21\u578b\u4e0e\u4eba\u7c7b\u504f\u597d\u4e00\u81f4\u3002<\/strong><\/p>\n\n\n\n<p><strong>ORPO\u662f\u53e6\u4e00\u79cd\u65b0\u7684LLM\u5bf9\u9f50\u65b9\u6cd5\uff0c\u8fd9\u79cd\u65b9\u6cd5\u751a\u81f3\u4e0d\u9700\u8981SFT\u6a21\u578b\u3002\u901a\u8fc7ORPO\u
ff0cLLM\u53ef\u4ee5\u540c\u65f6\u5b66\u4e60\u56de\u7b54\u6307\u4ee4\u548c\u6ee1\u8db3\u4eba\u7c7b\u504f\u597d\u3002<\/strong><\/p>\n\n\n\n<p>\u5bf9\u4e8eSFT\uff0c\u5b83\u662f\u5728\u4e0e\u9009\u62e9\u7684\u7b54\u6848\u914d\u5bf9\u7684\u63d0\u793a\u4e0a\u8fdb\u884c\u8bad\u7ec3\u7684\u3002<strong>\u7528\u4e8eSFT\u7684\u6570\u636e\u96c6\u53ef\u4ee5\u4e0e\u504f\u597d\u4f18\u5316\u4f7f\u7528\u7684\u76f8\u540c\uff0c\u4f46\u4e0d\u5305\u62ec&#8220;\u88ab\u62d2\u7edd&#8221;\u7684\u7b54\u6848<\/strong>\u3002\u6240\u4ee5\u53ef\u4ee5\u76f4\u89c2\u5730\u8ba4\u4e3a\uff0c<strong>\u5e94\u8be5\u80fd\u591f\u5fae\u8c03\u4e00\u4e2a\u57fa\u7840LLM\uff0c\u4f7f\u5176\u5728\u5b66\u4e60\u5982\u4f55\u56de\u7b54\u6307\u4ee4\u7684\u540c\u65f6\uff0c\u4e5f\u5b66\u4f1a\u60e9\u7f5a\u548c\u504f\u597d\u67d0\u4e9b\u7b54\u6848\u3002<\/strong><\/p>\n\n\n\n<p><strong>SFT\u53ea\u7528\u6b63\u6837\u672c\u66f4\u65b0\u7b56\u7565\uff0c\u6ca1\u6709\u8003\u8651\u5230\u8d1f\u6837\u672c\uff0c\u4f1a\u628a\u8d1f\u6837\u672c\u751f\u6210\u7684\u6982\u7387\u540c\u65f6\u62c9\u9ad8\uff0c\u5982\u4e0b\u56fe\u6240\u793a\uff1a<\/strong><\/p>\n\n\n\n<div class=\"wp-block-image\"><figure class=\"aligncenter size-full is-resized\"><img loading=\"lazy\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-45.png\" alt=\"\" class=\"wp-image-24755\" width=\"410\" height=\"267\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-45.png 633w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-45-300x195.png 300w\" sizes=\"(max-width: 410px) 100vw, 410px\" \/><\/figure><\/div>\n\n\n\n<p>\u7531\u4e8e<strong>SFT\u7684\u635f\u5931\u51fd\u6570\u5bf9\u4e8erejected data\u6ca1\u6709\u60e9\u7f5a\u9879\uff0cSFT\u4e4b\u540e\u6b63\u6837\u672c\u548c\u8d1f\u6837\u672c\u7684\u751f\u6210\u6982\u7387\u6709\u53ef\u80fd\u540c\u65f6\u4e0a\u5347\u3002<\/strong><\/p>\n\n\n\n<p><strong>odds\u5b9a\u4e49\uff1a\u6a21\u578b\u03b8\u751f\u6210 \u8f93\u51fa\u5e8f\u5217y\u7684\u53ef\u80fd\u6027 \u6bd4 
\u4e0d\u751f\u6210y\u5e8f\u5217\u7684\u53ef\u80fd\u6027 \u6bd4\u503c\u3002<\/strong><\/p>\n\n\n\n<div class=\"wp-block-image\"><figure class=\"aligncenter size-full is-resized\"><img loading=\"lazy\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-46.png\" alt=\"\" class=\"wp-image-24760\" width=\"316\" height=\"52\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-46.png 538w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-46-300x49.png 300w\" sizes=\"(max-width: 316px) 100vw, 316px\" \/><\/figure><\/div>\n\n\n\n<p>OR\u4e3a\u6b63\u8d1f\u6837\u672c\u7684odds\u7684\u6bd4\u503c\uff1a<\/p>\n\n\n\n<figure class=\"wp-block-image size-full is-resized\"><img loading=\"lazy\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-47.png\" alt=\"\" class=\"wp-image-24767\" width=\"309\" height=\"51\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-47.png 528w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-47-300x49.png 300w\" sizes=\"(max-width: 309px) 100vw, 309px\" \/><\/figure>\n\n\n\n<p>ORPO\u7b97\u6cd5\u8981\u505a\u7684\u5c31\u662f\u6700\u5927\u5316OR\uff0c\u5373<strong>\u6700\u5927\u5316\u6b63\u6837\u672c\u7684\u751f\u6210\u6982\u7387\uff0c\u6700\u5c0f\u5316\u8d1f\u6837\u672c\u7684\u751f\u6210\u6982\u7387<\/strong>\uff0cLOR\u9879\u7528\u4e86\u548cDPO\u7c7b\u4f3c\u7684logsigmoid\u7684\u5f62\u5f0f\uff1a<\/p>\n\n\n\n<figure class=\"wp-block-image size-full is-resized\"><img loading=\"lazy\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-44.png\" alt=\"\" class=\"wp-image-24744\" width=\"350\" height=\"49\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-44.png 495w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-44-300x42.png 300w\" sizes=\"(max-width: 350px) 100vw, 350px\" 
\/><\/figure>\n\n\n\n<p>ORPO\u5c31\u662f\u5728\u8fd9\u4e2a\u7406\u8bba\u57fa\u7840\u4e0a\u5efa\u7acb\u7684\uff0c<strong>ORPO\u7b80\u5355\u5730\u901a\u8fc7\u6dfb\u52a0\u8d1f\u5bf9\u6570\u4f3c\u7136\u635f\u5931\u4e0eOR\u635f\u5931\uff08OR\u4ee3\u8868\u51e0\u7387\u6bd4\uff0c\u5373 Odds Ratio\uff09\u6765\u4fee\u6539\u8bad\u7ec3\u635f\u5931\uff1a<\/strong><\/p>\n\n\n\n<figure class=\"wp-block-image size-full is-resized\"><img loading=\"lazy\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-43.png\" alt=\"\" class=\"wp-image-24742\" width=\"375\" height=\"41\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-43.png 528w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-43-300x33.png 300w\" sizes=\"(max-width: 375px) 100vw, 375px\" \/><\/figure>\n\n\n\n<p>OR\u635f\u5931\u5bf9\u88ab\u62d2\u7edd\u7684\u7b54\u6848\u8fdb\u884c\u5f31\u60e9\u7f5a\uff0c\u800c\u5bf9\u9009\u62e9\u7684\u7b54\u6848\u8fdb\u884c\u5f3a\u6709\u529b\u7684\u5956\u52b1\u3002\u8fd9\u91cc\u5305\u542b\u4e86\u4e00\u4e2a\u8d85\u53c2\u6570lambda\u7528\u4e8e\u52a0\u6743OR\u635f\u5931\u3002\u901a\u8fc7ORPO\u7684\u635f\u5931\uff0c\u6a21\u578b\u5728\u5b66\u4e60\u4e86SFT\u671f\u95f4\u7684\u5185\u5bb9\u7684\u540c\u65f6\uff0c\u4e5f\u5b66\u4f1a\u4e86\u4eba\u7c7b\u504f\u597d\u3002<\/p>\n\n\n\n<p>ORPO\u9700\u8981\u6570\u5343\u4e2a\u8bad\u7ec3\u6b65\u9aa4\u6765\u5b66\u4e60\u5982\u4f55\u533a\u5206\u9009\u62e9\u7684\u54cd\u5e94\u548c\u62d2\u7edd\u7684\u54cd\u5e94\u3002\u4e3a\u4e86\u83b7\u5f97\u4e0e\u8bba\u6587\u7c7b\u4f3c\u7684\u7ed3\u679c\uff0c\u5e94\u8be5\u8bad\u7ec3ORPO\u81f3\u5c112000\u6b65\uff0c\u603b\u6279\u5927\u5c0f\u4e3a64(\u5982\u8bba\u6587\u6240\u8ff0)\u3002<\/p>\n\n\n\n<p>ORPO \u5df2\u7ecf\u53ef\u4ee5\u5728Hugging 
Face\u5e93\u4e0a\u4f7f\u7528\u4e86\uff0c\u5e76\u4e14\u5b83\u56e0\u4e3a\u53ea\u4fee\u6539\u4e86\u635f\u5931\u51fd\u6570\uff0c\u6240\u4ee5\u53ef\u4ee5\u5f88\u597d\u5730\u4e0e\u73b0\u6709\u7684LoRA\u65b9\u6cd5\u96c6\u6210\u3002<\/p>\n\n\n\n<p>ORPO\u662f\u4e00\u79cd\u5c06\u6307\u4ee4\u5fae\u8c03\u4e0e\u504f\u597d\u5bf9\u9f50\u5408\u5e76\u4e3a\u5355\u6b65\u7684LLM\u8bad\u7ec3\u65b0\u65b9\u6cd5\u3002\u5b83\u4e0d\u9700\u8981\u4efb\u4f55\u5956\u52b1\u6a21\u578b\u6216SFT\u6a21\u578b\uff0c\u5e76\u4e14ORPO\u6bd4DPO\u548cRLHF\u66f4\u7b80\u5355\u3002\u6839\u636e\u8bba\u6587\uff0cORPO\u7684\u6027\u80fd\u4e0eDPO\u76f8\u5f53\u6216\u7565\u597d\u3002\u4f46\u662fORPO\u9700\u8981\u51e0\u5343\u4e2a\u8bad\u7ec3\u6b65\u9aa4\u6765\u5b66\u4e60\u597d\u7684\u548c\u574f\u7684\u54cd\u5e94\u4e4b\u95f4\u7684\u533a\u522b\u3002<\/p>\n\n\n\n<h2>SimPO \u7b80\u5355\u504f\u597d\u4f18\u5316\uff1a<\/h2>\n\n\n\n<ul><li><strong>\u8bba\u6587\u6807\u9898\uff1aSimPO: Simple Preference Optimization with a Reference-Free Reward<\/strong><\/li><li><strong>\u8bba\u6587\u5730\u5740\uff1a<a rel=\"noreferrer noopener\" href=\"https:\/\/arxiv.org\/pdf\/2405.14734\" target=\"_blank\">https:\/\/arxiv.org\/pdf\/2405.14734<\/a><\/strong><\/li><li><strong>\u4ee3\u7801 &amp; \u6a21\u578b\uff1a<a rel=\"noreferrer noopener\" href=\"https:\/\/github.com\/princeton-nlp\/SimPO\" target=\"_blank\">https:\/\/github.com\/princeton-nlp\/SimPO<\/a><\/strong><\/li><\/ul>\n\n\n\n<p class=\"has-light-pink-background-color has-background\"><strong>\u7b97\u6cd5\u7684\u6838\u5fc3\u662f\u5c06\u504f\u597d\u4f18\u5316\u76ee\u6807\u4e2d\u7684\u5956\u52b1\u51fd\u6570\u4e0e\u751f\u6210\u6307\u6807\u5bf9\u9f50<\/strong>\uff0c<strong><em>\u4e0d\u9700\u8981ref\u53c2\u8003\u6a21\u578b<\/em><\/strong><\/p>\n\n\n\n<p class=\"has-light-gray-background-color has-background\"><strong>SimPO 
\u5305\u542b\u4e24\u4e2a\u4e3b\u8981\u7ec4\u4ef6\uff1a\uff081\uff09\u5728\u957f\u5ea6\u4e0a\u5f52\u4e00\u5316\u7684\u5956\u52b1\u3010\/|y|\u3011\uff0c\u5176\u8ba1\u7b97\u65b9\u5f0f\u662f\u4f7f\u7528\u7b56\u7565\u6a21\u578b\u7684\u5956\u52b1\u4e2d\u6240\u6709 token \u7684\u5e73\u5747\u5bf9\u6570\u6982\u7387\uff1b\uff082\uff09\u76ee\u6807\u5956\u52b1\u5dee\u989d  \u03b3 \uff0c\u7528\u4ee5\u786e\u4fdd\u83b7\u80dc\u548c\u5931\u8d25\u54cd\u5e94\u4e4b\u95f4\u7684\u5956\u52b1\u5dee\u8d85\u8fc7\u8fd9\u4e2a\u5dee\u989d \u03b3 \u3002<\/strong><\/p>\n\n\n\n<figure class=\"wp-block-image size-large\"><img loading=\"lazy\" width=\"1024\" height=\"226\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-48-1024x226.png\" alt=\"\" class=\"wp-image-24808\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-48-1024x226.png 1024w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-48-300x66.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-48-768x169.png 768w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-48.png 1242w\" sizes=\"(max-width: 1024px) 100vw, 1024px\" \/><\/figure>\n\n\n\n<p>DPO \u662f\u6700\u5e38\u7528\u7684\u79bb\u7ebf\u504f\u597d\u4f18\u5316\u65b9\u6cd5\u4e4b\u4e00\u3002DPO \u5e76\u4e0d\u4f1a\u5b66\u4e60\u4e00\u4e2a\u663e\u5f0f\u7684\u5956\u52b1\u6a21\u578b\uff0c\u800c\u662f\u4f7f\u7528\u4e00\u4e2a\u5e26\u6700\u4f18\u7b56\u7565\u7684\u95ed\u5f0f\u8868\u8fbe\u5f0f\u6765\u5bf9\u5956\u52b1\u51fd\u6570 r \u8fdb\u884c\u91cd\u65b0<mark>\u53c2\u6570<\/mark>\u5316\uff1a<\/p>\n\n\n\n<figure class=\"wp-block-image size-full is-resized\"><img loading=\"lazy\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-52.png\" alt=\"\" class=\"wp-image-24844\" width=\"523\" height=\"45\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-52.png 816w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-52-300x26.png 300w, 
http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-52-768x66.png 768w\" sizes=\"(max-width: 523px) 100vw, 523px\" \/><\/figure>\n\n\n\n<p>\u5176\u4e2d \u03c0_\u03b8 \u662f\u7b56\u7565\u6a21\u578b\uff0c\u03c0_ref \u662f\u53c2\u8003\u7b56\u7565\uff08\u901a\u5e38\u662f SFT \u6a21\u578b\uff09\uff0cZ (x) \u662f\u914d\u5206\u51fd\u6570\u3002\u901a\u8fc7\u5c06\u8fd9\u79cd\u5956\u52b1\u6784\u5efa\u65b9\u5f0f\u6574\u5408\u8fdb Bradley-Terry (BT) \u6392\u540d\u76ee\u6807\uff0cDPO \u53ef\u4f7f\u7528\u7b56\u7565\u6a21\u578b\u800c\u975e\u5956\u52b1\u6a21\u578b\u6765\u8868\u793a\u504f\u597d\u6570\u636e\u7684\u6982\u7387\uff0c\u4ece\u800c\u5f97\u5230\u4ee5\u4e0b\u76ee\u6807\uff1a<\/p>\n\n\n\n<figure class=\"wp-block-image size-large\"><img loading=\"lazy\" width=\"1024\" height=\"92\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-54-1024x92.png\" alt=\"\" class=\"wp-image-24846\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-54-1024x92.png 1024w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-54-300x27.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-54-768x69.png 768w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-54.png 1077w\" sizes=\"(max-width: 1024px) 100vw, 1024px\" \/><\/figure>\n\n\n\n<p><strong>DPO \u7684\u5956\u52b1\u4e0e\u751f\u6210\u4e4b\u95f4\u7684\u5dee\u5f02<\/strong>\u3002\u4f7f\u7528 (1) \u5f0f\u4f5c\u4e3a\u9690\u5f0f\u7684\u5956\u52b1\u8868\u8fbe\u5f0f\u6709\u4ee5\u4e0b\u7f3a\u70b9\uff1a(1)<strong> \u8bad\u7ec3\u9636\u6bb5\u9700\u8981\u53c2\u8003\u6a21\u578b \u03c0_ref\uff0c\u8fd9\u4f1a\u5e26\u6765\u989d\u5916\u7684\u5185\u5b58\u548c\u8ba1\u7b97\u6210\u672c<\/strong>\uff1b(2) <strong>\u8bad\u7ec3\u9636\u6bb5\u4f18\u5316\u7684\u5956\u52b1\u4e0e\u63a8\u7406\u6240\u7528\u7684\u751f\u6210\u6307\u6807\u4e4b\u95f4\u5b58\u5728\u5dee\u5f02<\/strong>\u3002\u5177\u4f53\u6765\u8bf4\uff0c<strong>\u5728\u751f\u6210\u9636\u6bb5\uff0c\u4f1a\u4f7f\u7528\u7b56\u7565\u6a21\u578b 
\u03c0_\u03b8 \u751f\u6210\u4e00\u4e2a\u80fd\u8fd1\u4f3c\u6700\u5927\u5316\u5e73\u5747\u5bf9\u6570\u4f3c\u7136\u7684\u5e8f\u5217\uff0c\u5b9a\u4e49\u5982\u4e0b\uff1a<\/strong><\/p>\n\n\n\n<figure class=\"wp-block-image size-full is-resized\"><img loading=\"lazy\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-56.png\" alt=\"\" class=\"wp-image-24864\" width=\"594\" height=\"53\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-56.png 940w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-56-300x27.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-56-768x69.png 768w\" sizes=\"(max-width: 594px) 100vw, 594px\" \/><\/figure>\n\n\n\n<p>\u5728\u89e3\u7801\u8fc7\u7a0b\u4e2d\u76f4\u63a5\u6700\u5927\u5316\u8be5\u6307\u6807\u662f\u975e\u5e38\u56f0\u96be\u7684\uff0c\u4e3a\u6b64\u53ef\u4ee5\u4f7f\u7528\u591a\u79cd\u89e3\u7801\u7b56\u7565\uff0c\u5982\u8d2a\u5a6a\u89e3\u7801\u3001\u6ce2\u675f\u641c\u7d22\u3001\u6838\u91c7\u6837\u548c top-k \u91c7\u6837\u3002\u6b64\u5916\uff0c\u8be5\u6307\u6807\u901a\u5e38\u7528\u4e8e\u5728<mark>\u8bed\u8a00\u6a21\u578b<\/mark>\u6267\u884c\u591a\u9009\u4efb\u52a1\u65f6\u5bf9\u9009\u9879\u8fdb\u884c\u6392\u540d\u3002\u5728 DPO \u4e2d\uff0c\u5bf9\u4e8e\u4efb\u610f\u4e09\u5143\u7ec4 (x, y_w, y_l)\uff0c\u6ee1\u8db3\u5956\u52b1\u6392\u540d r (x, y_w) &gt; r (x, y_l) \u5e76\u4e0d\u4e00\u5b9a\u610f\u5473\u7740\u6ee1\u8db3\u4f3c\u7136\u6392\u540d\uff1a<\/p>\n\n\n\n<div class=\"wp-block-image\"><figure class=\"aligncenter is-resized\"><img loading=\"lazy\" src=\"https:\/\/image.jiqizhixin.com\/uploads\/editor\/67dfa99e-5b3c-47b3-92bb-b35b99ddd3be\/640.png\" alt=\"\u56fe\u7247\" width=\"189\" height=\"22\"\/><\/figure><\/div>\n\n\n\n<p>\u4e8b\u5b9e\u4e0a\uff0c\u5728\u4f7f\u7528 DPO \u8bad\u7ec3\u65f6\uff0c\u7559\u5b58\u96c6\u4e2d\u5927\u7ea6\u53ea\u6709 50% 
\u7684\u4e09\u5143\u7ec4\u6ee1\u8db3\u8fd9\u4e2a\u6761\u4ef6\u3002<\/p>\n\n\n\n<p>\u6784\u5efa<strong>\u5728\u957f\u5ea6\u4e0a\u5f52\u4e00\u5316\u7684\u5956\u52b1<\/strong>\u3002\u5f88\u81ea\u7136\u5730\uff0c\u6211\u4eec\u4f1a\u8003\u8651\u4f7f\u7528 (3) \u5f0f\u4e2d\u7684 p_\u03b8 \u6765\u66ff\u6362 DPO \u4e2d\u7684\u5956\u52b1\u6784\u5efa\uff0c\u4f7f\u5176\u4e0e\u5f15\u5bfc\u751f\u6210\u7684\u4f3c\u7136\u6307\u6807\u5bf9\u9f50\u3002\u8fd9\u4f1a\u5f97\u5230\u4e00\u4e2a\u5728\u957f\u5ea6\u4e0a\u5f52\u4e00\u5316\u7684\u5956\u52b1\uff1a<\/p>\n\n\n\n<figure class=\"wp-block-image size-full\"><img loading=\"lazy\" width=\"954\" height=\"84\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-57.png\" alt=\"\" class=\"wp-image-24866\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-57.png 954w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-57-300x26.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-57-768x68.png 768w\" sizes=\"(max-width: 954px) 100vw, 954px\" \/><\/figure>\n\n\n\n<p>\u5176\u4e2d \u03b2 \u662f\u63a7\u5236\u5956\u52b1\u5dee\u5f02\u5927\u5c0f\u7684\u5e38\u91cf\u3002\u8be5\u56e2\u961f\u53d1\u73b0\uff0c\u6839\u636e\u54cd\u5e94\u957f\u5ea6\u5bf9\u5956\u52b1\u8fdb\u884c\u5f52\u4e00\u5316\u975e\u5e38\u5173\u952e\uff1b\u4ece\u5956\u52b1\u516c\u5f0f\u4e2d\u79fb\u9664\u957f\u5ea6\u5f52\u4e00\u5316\u9879\u4f1a\u5bfc\u81f4\u6a21\u578b\u503e\u5411\u4e8e\u751f\u6210\u66f4\u957f\u4f46\u8d28\u91cf\u66f4\u4f4e\u7684\u5e8f\u5217\u3002\u8fd9\u6837\u4e00\u6765\uff0c\u6784\u5efa\u7684\u5956\u52b1\u4e2d\u5c31\u65e0\u9700\u53c2\u8003\u6a21\u578b\u4e86\uff0c\u4ece\u800c\u5b9e\u73b0\u6bd4\u4f9d\u8d56\u53c2\u8003\u6a21\u578b\u7684\u7b97\u6cd5\u66f4\u9ad8\u7684\u5185\u5b58\u548c\u8ba1\u7b97\u6548\u7387\u3002<\/p>\n\n\n\n<p><strong>SimPO \u76ee\u6807<\/strong><\/p>\n\n\n\n<p>\u76ee\u6807\u5956\u52b1\u5dee\u989d\u3002\u53e6\u5916\uff0c\u8be5\u56e2\u961f\u8fd8\u4e3a Bradley-Terry 
\u76ee\u6807\u5f15\u5165\u4e86\u4e00\u4e2a\u76ee\u6807\u5956\u52b1\u5dee\u989d\u9879 \u03b3 &gt; 0\uff0c\u4ee5\u786e\u4fdd<strong>\u83b7\u80dc\u54cd\u5e94\u7684\u5956\u52b1 r (x, y_w) \u8d85\u8fc7\u5931\u8d25\u54cd\u5e94\u7684\u5956\u52b1 r (x, y_l) \u81f3\u5c11 \u03b3<\/strong>\uff1a<\/p>\n\n\n\n<figure class=\"wp-block-image size-full is-resized\"><img loading=\"lazy\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-58.png\" alt=\"\" class=\"wp-image-24869\" width=\"578\" height=\"40\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-58.png 777w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-58-300x21.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-58-768x54.png 768w\" sizes=\"(max-width: 578px) 100vw, 578px\" \/><\/figure>\n\n\n\n<p>\u4e24\u4e2a\u7c7b\u4e4b\u95f4\u7684\u5dee\u989d\u5df2\u77e5\u4f1a\u5f71\u54cd\u5206\u7c7b\u5668\u7684\u6cdb\u5316\u80fd\u529b\u3002\u5728\u4f7f\u7528\u968f\u673a\u6a21\u578b\u521d\u59cb\u5316\u7684\u6807\u51c6\u8bad\u7ec3\u8bbe\u7f6e\u4e2d\uff0c\u589e\u52a0\u76ee\u6807\u5dee\u989d\u901a\u5e38\u80fd\u63d0\u5347\u6cdb\u5316\u6027\u80fd\u3002\u5728\u504f\u597d\u4f18\u5316\u4e2d\uff0c\u8fd9\u4e24\u4e2a\u7c7b\u522b\u662f\u5355\u4e2a\u8f93\u5165\u7684\u83b7\u80dc\u6216\u5931\u8d25\u54cd\u5e94\u3002<\/p>\n\n\n\n<p>\u5728\u5b9e\u8df5\u4e2d\uff0c\u8be5\u56e2\u961f\u89c2\u5bdf\u5230\u968f\u7740\u76ee\u6807\u5dee\u989d\u589e\u5927\uff0c\u751f\u6210\u8d28\u91cf\u4e00\u5f00\u59cb\u4f1a\u63d0\u5347\uff0c\u4f46\u5f53\u8fd9\u4e2a\u5dee\u989d\u53d8\u5f97\u8fc7\u5927\u65f6\uff0c\u751f\u6210\u8d28\u91cf\u5c31\u4f1a\u4e0b\u964d\u3002DPO \u7684\u4e00\u79cd\u53d8\u4f53 IPO \u4e5f\u6784\u5efa\u4e86\u4e0e SimPO \u7c7b\u4f3c\u7684\u76ee\u6807\u5956\u52b1\u5dee\u989d\uff0c\u4f46\u5176\u6574\u4f53\u76ee\u6807\u7684\u6548\u679c\u4e0d\u53ca SimPO\u3002<\/p>\n\n\n\n<p>\u76ee\u6807\u3002\u6700\u540e\uff0c\u901a\u8fc7\u5c06 (4) \u5f0f\u4ee3\u5165\u5230 (5) 
\u5f0f\u4e2d\uff0c\u53ef\u4ee5\u5f97\u5230 SimPO \u76ee\u6807\uff1a<\/p>\n\n\n\n<figure class=\"wp-block-image size-large\"><img loading=\"lazy\" width=\"1024\" height=\"100\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-59-1024x100.png\" alt=\"\" class=\"wp-image-24875\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-59-1024x100.png 1024w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-59-300x29.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-59-768x75.png 768w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-59.png 1150w\" sizes=\"(max-width: 1024px) 100vw, 1024px\" \/><\/figure>\n\n\n\n<p>\u603b\u7ed3\u8d77\u6765\uff0cSimPO \u91c7\u7528\u4e86\u4e0e\u751f\u6210\u6307\u6807\u76f4\u63a5\u5bf9\u9f50\u7684\u9690\u5f0f\u5956\u52b1\u5f62\u5f0f\uff0c\u4ece\u800c\u6d88\u9664\u4e86\u5bf9\u53c2\u8003\u6a21\u578b\u7684\u9700\u6c42\u3002\u6b64\u5916\uff0c\u5176\u8fd8\u5f15\u5165\u4e86\u4e00\u4e2a\u76ee\u6807\u5956\u52b1\u5dee\u989d \u03b3 \u6765\u5206\u79bb\u83b7\u80dc\u548c\u5931\u8d25\u54cd\u5e94<\/p>\n\n\n\n<h2>KTO\uff1aKahneman-Tversky Optimisation<\/h2>\n\n\n\n<ul><li><strong><em>code: <a href=\"https:\/\/github.com\/huggingface\/trl\/blob\/dcee683d968444179f57bffa5a49a7ec13f57654\/trl\/trainer\/kto_trainer.py\">https:\/\/github.com\/huggingface\/trl\/blob\/dcee683d968444179f57bffa5a49a7ec13f57654\/trl\/trainer\/kto_trainer.py<\/a><\/em><\/strong><\/li><li><a href=\"https:\/\/github.com\/hiyouga\/LLaMA-Factory\/blob\/main\/src\/llamafactory\/train\/kto\/trainer.py\"><em><strong>https:\/\/github.com\/hiyouga\/LLaMA-Factory\/blob\/main\/src\/llamafactory\/train\/kto\/trainer.py<\/strong><\/em><\/a><\/li><li><a rel=\"noreferrer noopener\" href=\"https:\/\/link.zhihu.com\/?target=https%3A\/\/arxiv.org\/pdf\/2402.01306.pdf\" target=\"_blank\"><strong><em>KTO: Model Alignment as Prospect Theoretic 
Optimization<\/em><\/strong><\/a><\/li><\/ul>\n\n\n\n<p><strong>\u7279\u70b9\uff1a<\/strong><\/p>\n\n\n\n<p class=\"has-light-gray-background-color has-background\"><strong>KTO\u5173\u6ce8\u7684\u662f\u7b54\u6848\u504f\u79bb\u5e73\u5747\u6c34\u51c6\u7684\u7a0b\u5ea6<\/strong>\u2014\u2014<strong>\u6bd4\u5e73\u5747\u597d\u8fd8\u662f\u574f<\/strong>\u3002\u6240\u4ee5<strong>\u5b83\u7684\u8bad\u7ec3\u6570\u636e\u96c6\u662f\u5bf9\u5355\u4e2a\u95ee\u7b54\u7684\u201c\u597d\/\u5dee\u201d\u6807\u6ce8\uff0c\u800c\u4e0d\u518d\u662f\u6210\u5bf9\u6570\u636e\u95f4\u8c01\u597d\u8c01\u5dee\uff08\u6240\u4ee5\u7528\u6237\u5bf9LLM\u7ed3\u679c\u7684\u70b9\u8d5e\u6216\u8e29\u5c31\u53ef\u4ee5\u5f53\u505a\u53cd\u9988\u4f7f\u7528\u4e86\uff09\u3002<\/strong><\/p>\n\n\n\n<p class=\"has-light-pink-background-color has-background\"><strong>KTO\u4e0d\u9700\u8981\u504f\u597d\u6570\u636e\uff0c\u53ef\u4ee5\u76f4\u63a5\u5229\u7528\u4e8c\u5143\u4fe1\u53f7\u6807\u8bb0\u7684\u6570\u636e\u6765\u8bad\u7ec3\u7b97\u6cd5\uff0c\u5bf9\u4e8e\u8d1f\u6837\u672c\u66f4\u52a0\u654f\u611f\u3002<\/strong> <strong>KTO\u5e76\u4e0d\u9700\u8981\u4e00\u4e2a\u6570\u636e\u5bf9\uff0c\u53ea\u9700\u8981\u5bf9\u751f\u6210\u7684\u7ed3\u679c\u8fdb\u884cgood\/bad\u7684\u4e8c\u5143\u6807\u6ce8\u5373\u53ef\u3002<\/strong> \u3010\u6bd4\u5982\uff1aChatGPT UI \u754c\u9762\u4f1a\u8f93\u51fa\u4e24\u4e2a\u7b54\u6848\uff0c\u7528\u6237\u53ef\u4ee5\u9009\u62e9\u54ea\u4e2a\u66f4\u597d\uff0c\u9002\u7528\u4e8e\u4ece\u751f\u4ea7\u73af\u5883\u4e2d\u8fd0\u884c\u7684\u804a\u5929\u6a21\u578b\u7684\u8bad\u7ec3\u3011<\/p>\n\n\n\n<p>\u5b9e\u9a8c\u8868\u660e\uff0cKTO\u7b97\u6cd5\u5728\u4e00\u5b9a\u53c2\u6570\u8303\u56f4\u5185\u80fd\u591f\u8d85\u8fc7DPO\u7b97\u6cd5\uff0c\u5e76\u4e14KTO\u53ef\u4ee5\u5904\u7406\u6570\u636e\u6b63\u8d1f\u6837\u672c\u4e0d\u5e73\u8861\u7684\u60c5\u51b5\u3002\u540c\u65f6\uff0c\u5728\u8df3\u8fc7SFT\u9636\u6bb5\u7684\u60c5\u51b5\u4e0b\uff0c\u76f4\u63a5\u4f7f\u7528KTO\u76f8\u6bd4\u4e8e\u76f4\u63a5\u4f7f\u7528 
DPO\uff0c\u6548\u679c\u6709\u5f88\u5927\u63d0\u5347\u3002<strong>\u5728\u6570\u636e\u6b63\u8d1f\u6837\u672c\u6bd4\u4f8b\u5931\u8861\/\u504f\u597d\u6570\u636e\u6709\u975e\u4f20\u9012\u6027\/\u504f\u597d\u6570\u636e\u6709\u566a\u58f0\u7684\u60c5\u51b5\u4e0b\uff0c\u4f7f\u7528KTO\u53ef\u80fd\u662f\u66f4\u597d\u7684\u9009\u62e9\u3002<\/strong><\/p>\n\n\n\n<p><code>KTO<\/code>&nbsp;\u4f7f\u7528 Kahneman-Tversky&nbsp;<strong>\u4eba\u7c7b\u6548\u7528\u6a21\u578b<\/strong>\uff0c\u8bba\u6587\u63d0\u51fa\u76f4\u63a5\u6700\u5927\u5316\u751f\u6210\u6548\u7528\u7684&nbsp;<code>HALO<\/code>, \u800c\u4e0d\u662f\u6700\u5927\u5316\u504f\u597d\u7684\u5bf9\u6570\u4f3c\u7136\u3002<\/p>\n\n\n\n<ul><li>\u57281B~30B\u5c3a\u5ea6\u4e0a\u4e0e\u57fa\u4e8e\u504f\u597d\u7684\u65b9\u6cd5\u7684\u6027\u80fd\u76f8\u5339\u914d\u6216\u8d85\u8fc7\uff0c\u5c3d\u7ba1\u5b83\u53ea\u4ece\u4e8c\u8fdb\u5236\u4fe1\u53f7\uff080\u6216\u80051\uff09\u4e2d\u5b66\u4e60\u8f93\u51fa\u662f\u5426\u53ef\u53d6\u3002<\/li><li>\u6ca1\u6709\u4e00\u4e2a HALO \u666e\u904d\u4f18\u8d8a\uff1b<\/li><li>\u6700\u4f73\u635f\u5931\u53d6\u51b3\u4e8e\u6700\u9002\u5408\u7ed9\u5b9a\u8bbe\u7f6e\u7684\u5f52\u7eb3\u504f\u5dee\uff0c\u8fd9\u662f\u4e00\u4e2a\u7ecf\u5e38\u88ab\u5ffd\u89c6\u7684\u8003\u8651\u56e0\u7d20\u3002<\/li><\/ul>\n\n\n\n<p><strong>KTO\u7b97\u6cd5\u7684\u5177\u4f53\u6b65\u9aa4\u5982\u4e0b\uff1a<\/strong><\/p>\n\n\n\n<ol><li><strong>\u5b9a\u4e49\u6548\u7528\u51fd\u6570<\/strong>\uff1a\u6839\u636e\u524d\u666f\u7406\u8bba\u4e2d\u7684\u6548\u7528\u51fd\u6570\u516c\u5f0f\uff0c\u5b9a\u4e49\u4e00\u4e2a\u6548\u7528\u51fd\u6570\uff0c<strong>\u7528\u4e8e\u8ba1\u7b97\u6a21\u578b\u8f93\u51fa\u76f8\u5bf9\u4e8e\u53c2\u8003\u70b9\u7684\u6548\u7528\u3002<\/strong><\/li><li><strong>\u8ba1\u7b97\u53c2\u8003\u70b9<\/strong>\uff1a\u6839\u636e\u6982\u7387\u5206\u5e03Q(X&#8217;, Y&#8217; | x, 
y)\uff0c\u8ba1\u7b97\u51fa\u4e00\u4e2a\u53c2\u8003\u70b9\uff0c\u7528\u4e8e\u8861\u91cf\u6a21\u578b\u8f93\u51fa\u7684\u6548\u7528\u3002<\/li><li><strong>\u8ba1\u7b97\u6a21\u578b\u8f93\u51fa\u7684\u6548\u7528<\/strong>\uff1a\u5bf9\u4e8e\u6bcf\u4e2a\u8f93\u5165\uff0c\u8ba1\u7b97\u6a21\u578b\u8f93\u51fa\u76f8\u5bf9\u4e8e\u53c2\u8003\u70b9\u7684\u6536\u76ca\u6216\u635f\u5931\uff0c\u7136\u540e\u4f7f\u7528\u6548\u7528\u51fd\u6570\u8ba1\u7b97\u8fd9\u4e9b\u6536\u76ca\u6216\u635f\u5931\u7684\u6548\u7528\u3002<\/li><li><strong>\u4f18\u5316\u6a21\u578b\u53c2\u6570<\/strong>\uff1a\u4f18\u5316\u6a21\u578b\u53c2\u6570\u4ee5\u6700\u5927\u5316\u6a21\u578b\u8f93\u51fa\u7684\u603b\u6548\u7528\u3002<\/li><\/ol>\n\n\n\n<p>KTO \u635f\u5931\u51fd\u6570\u672c\u8d28\u662f\u628a pair-wise \u516c\u5f0f\u53d8\u6210 point-wise \u65b9\u5f0f\uff0c\u7ed3\u5408\u4e86HALOs\u4ee5\u53ca\u4e8c\u5143\u4fe1\u53f7\u6570\u636e\u7684\u601d\u60f3\u63d0\u51fa\u4f7f\u7528Kahneman-Tversky \u4f18\u5316\u7684KTO\u7b97\u6cd5\uff1a<\/p>\n\n\n\n<figure class=\"wp-block-image size-full is-resized\"><img loading=\"lazy\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-64.png\" alt=\"\" class=\"wp-image-24915\" width=\"580\" height=\"289\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-64.png 778w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-64-300x150.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-64-768x383.png 768w\" sizes=\"(max-width: 580px) 100vw, 580px\" 
\/><\/figure>\n\n\n\n<p>\u5176\u4e2d&nbsp;zo\u662fKL\u6563\u5ea6\u9879\uff0c\u53c2\u8003\u70b9zo\u4e3a\u6700\u4f18\u7b56\u7565\u4e0breward\u7684<strong>\u671f\u671b\u503c<\/strong>\uff0c\u6700\u7ec8\u53ef\u4ee5\u63a8\u5bfc\u6210KL\u6563\u5ea6\u7684\u5f62\u5f0f\uff0cy&#8217;\u8868\u793a\u4efb\u610f\u8f93\u51fa\uff0c\u5728\u5b9e\u9645\u8bad\u7ec3\u4e2d\uff0c<strong>Z<sub>0<\/sub>\u8868\u793abatch\u5e73\u5747\u6c34\u51c6\u7684\u7a0b\u5ea6<\/strong>\u3010<strong>Z<sub>0<\/sub>\u4ece<\/strong><strong style=\"font-size: revert;\">\u5f53\u524dbatch\u91cc\u9762\u7684\u6837\u672c\u8fdb\u884c\u4f30\u8ba1\u5f97\u5230\u7684<\/strong><span style=\"font-size: revert;\">\u3011<\/span>\uff0c\u5e73\u5747 reward\uff0c\u4ee3\u8868\u4e0d\u597d\u4e0d\u574f\u7684\u5c45\u4e2d\u7684\u7ed3\u679c\u3002&nbsp;LKTO&nbsp;\u5c31\u662fDPO\u4e2d\u63a8\u5bfc\u7684reward\u51fd\u6570\u5f62\u5f0f\u3002<\/p>\n\n\n\n<p>\u6309\u7167\u4e0a\u9762\u7684\u5b9a\u4e49\u4f30\u8ba1z0\u662f\u4e0d\u5207\u5b9e\u9645\u7684\uff0c\u56e0\u4e3a\u4ece\u03c0\u03b8\u91c7\u6837\u5f88\u6162\uff0c\u4eba\u7c7b\u65e0\u6cd5\u611f\u77e5\u03c0\u03b8\u5f15\u8d77\u7684\u5b8c\u6574\u5206\u5e03\u3002<\/p>\n\n\n\n<figure class=\"wp-block-image size-full is-resized\"><img loading=\"lazy\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-67.png\" alt=\"\" class=\"wp-image-24995\" width=\"526\" height=\"313\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-67.png 526w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/02\/image-67-300x179.png 300w\" sizes=\"(max-width: 526px) 100vw, 526px\" 
\/><\/figure>\n\n\n\n<p>\u8fd9\u4e2a\u4f30\u8ba1\u662f\u6709\u504f\u5dee\u7684\uff0c\u4f46\u8fd9\u662f\u53ef\u53d6\u7684\uff0c\u56e0\u4e3a\u5b83\u4f7f\u6211\u4eec\u66f4\u63a5\u8fd1\u4eba\u7c7b\u5982\u4f55\u6784\u5efa\u4ed6\u4eec\u7684\u4e3b\u89c2\u53c2\u8003\u70b9\u3002<\/p>\n\n\n\n<p><strong>\u5b9e\u9645\u4e0aKTO\u76f8\u5bf9\u6bd4DPO\u5dee\u5f02\u5c31\u4e24\u70b9<\/strong><\/p>\n\n\n\n<ul><li><strong>\u5bf9\u6b63\u8d1f\u6837\u672c\u8fdb\u884c\u4e86\u52a0\u6743<\/strong>\uff1aDPO\u91cc\u9762\u662f\u4f7f\u7528<strong>\u6b63\u8d1f\u6837\u672c\u7684reward\u5dee\u503c<\/strong>\u8fdb\u884csigmoid\u6620\u5c04\uff0c\u4f46\u662fKTO\u91cc\u9762\u4f7f\u7528<strong>reward\u6a21\u578b\u4e0eKL\u6563\u5ea6\u4e4b\u95f4\u7684\u5dee\u5f02<\/strong>\uff01\uff08<strong>\u8bf4\u662fKL\u6563\u5ea6\uff0c\u4f46\u5176\u5b9e\u4e5f\u662fbad\u7684log\u6bd4\u503c\u6570\u503c\uff01\u4e0d\u8fc7\u4e0d\u662f\u540c\u4e00\u4e2apair<\/strong>\uff09<\/li><li>\u6ce8\u610f\uff1a\u5728\u5b9e\u8df5\u7684\u65f6\u5019\uff0c<strong>KL\u9879\u5e76\u4e0d\u53c2\u4e0e\u53cd\u5411\u4f20\u64ad<\/strong>\uff0c\u8fd9\u5176\u5b9e\u5c31\u8ddfDPO\u66f4\u76f8\u4f3c\u7684\u3002DPO\u4f7f\u7528\u4e00\u4e2a\u6570\u636e\u5bf9\uff0c\u4f46\u662f\u8fd9\u91cc\u628a<strong>DPO\u7ed9\u62c6\u5206\u4e86\uff0c\u76f8\u5f53\u4e8e\u5bf9\u6bcf\u4e00\u4e2a\u6837\u672c\u5355\u72ec\u8fdb\u884c\u6700\u5927\u5316\u6216\u6700\u5c0f\u5316\u4e86\uff0c\u4ee5\u53ca\u8fdb\u884c\u52a0\u6743\u3002<\/strong>\u53e6\u4e00\u4e2a\u4f5c\u7528\u5c31\u662f\uff0c\u5982\u679c&nbsp;rKTO(x,y)&nbsp;\u7684\u5dee\u5f02\u4e0eKL\u6563\u5ea6\u6709\u8db3\u591f\u533a\u522b\u7684\u8bdd\uff0c\u90a3\u5bf9\u5e94\u7684Loss\u4e5f\u5c31\u6bd4\u8f83\u5c0f\u3002\u56e0\u6b64\uff0c<strong>KTO\u4f1a\u66f4\u52a0\u9f13\u52b1\u5dee\u5f02\u5927\u7684\u6570\u636e\u5bf9\u3002<\/strong><\/li><\/ul>\n\n\n\n<ul><li>\u4f46\u5176\u5b9e\u6211\u4eec\u53ef\u4ee5\u4eceKTO\u7684\u76ee\u6807\u51fd\u6570\u76f4\u63a5\u770b\u5230\u3002\u7531\u4e8eKTO\u662f\u5206\u522b\u9488\u5bf9\u5355\u6761\u6570\u636e\uf
f0c\u5982\u679c\u6570\u636e\u662f\u6b63\u6837\u672c\uff0c\u90a3\u4e48\u4e00\u5b9a\u8981\u8d85\u8fc7&nbsp;zo&nbsp;\u624d\u4f1a\u4ea7\u751f\u9884\u6d4b\u6b63\u786e\u53cd\u9988\uff1b\u5bf9\u4e8e\u8d1f\u6837\u672c\uff0c\u9700\u8981\u4f4e\u4e8e&nbsp;zo\u624d\u4f1a\u4ea7\u751f\u9884\u6d4b\u6b63\u786e\u53cd\u9988<\/li><\/ul>\n\n\n\n<p id=\"h_695992165_10\"><strong>KTO\u548cDPO\u7684\u9009\u62e9<\/strong>\uff1a<\/p>\n\n\n\n<ul><li><strong>\u6570\u636e\u6bd4\u4f8b<\/strong>\uff1a\u5982\u679c\u6570\u636e\u96c6\u662f\u4ee5good\/bad\u5f62\u5f0f\u8fdb\u884c\u6807\u6ce8\uff0c\u5e76\u4e14\u6570\u636e\u6bd4\u4f8b\u4e0d\u5e73\u8861\uff0c\u90a3\u4e48\u9009\u62e9KTO<\/li><\/ul>\n\n\n\n<ul><li><strong>\u6570\u636e\u8d28\u91cf<\/strong>\uff1a\u5982\u679c\u4f60\u7684\u504f\u597d\u6570\u636e\u8d28\u91cf\u9ad8\uff0c\u6570\u636e\u566a\u58f0\u5c0f\uff0c\u90a3\u4e48DPO\u7684\u6548\u679c\u66f4\u597d\u3002\u7531\u4e8e\u76ee\u524d\u516c\u5f00\u7684\u6570\u636e\u96c6\u4e2d\u5b58\u5728\u7684\u566a\u58f0\u8f83\u5927\uff0c\u8fd9\u5c31\u80fd\u89e3\u91ca\u4e3a\u4ec0\u4e48KTO\u7684\u6548\u679c\u4f1a\u8d85\u8fc7DPO\u4e86\u3002<\/li><li><strong>\u7406\u8bba\u5206\u6790<\/strong>\uff1aKTO\u4e0d\u4f1a\u4ece\u8d1f\u6837\u672c\u4e2d\u5b66\u4e60\u5230\u5f88\u9ad8\u7684\u53cd\u9988\uff0c\u4e5f\u4e0d\u4f1a\u4ece\u6b63\u6837\u672c\u4e2d\u5b66\u4e60\u5230\u5f88\u4f4e\u7684\u53cd\u9988\uff08\u6240\u4ee5\u5bf9\u566a\u58f0\u6bd4\u8f83\u9c81\u68d2\uff09<\/li><\/ul>\n\n\n\n<p><strong>KTO \u7684\u5de5\u4f5c\u539f\u7406\uff1a<\/strong><\/p>\n\n\n\n<ul><li>\u5982\u679c\u6a21\u578b\u4ee5\u76f4\u63a5(blunt manner)\u65b9\u5f0f\u589e\u52a0\u4e86\u7406\u60f3\u793a\u4f8b\u7684\u5956\u52b1\uff0c\u90a3\u4e48 KL 
\u60e9\u7f5a\u4e5f\u4f1a\u589e\u52a0\uff0c\u5e76\u4e14\u4e0d\u4f1a\u53d6\u5f97\u4efb\u4f55\u8fdb\u6b65\u3002\u8fd9\u8feb\u4f7f<strong>\u6a21\u578b\u51c6\u786e\u5730\u4e86\u89e3\u662f\u4ec0\u4e48\u8ba9\u8f93\u51fa\u53d8\u5f97\u7406\u60f3\uff0c\u8fd9\u6837\u5c31\u53ef\u4ee5\u589e\u52a0\u5956\u52b1\uff0c\u540c\u65f6\u4fdd\u6301 KL \u9879\u6301\u5e73\uff08\u751a\u81f3\u51cf\u5c11\uff09\u3002<\/strong><\/li><li>\u5b9e\u9645\u5b9e\u73b0\u4e2d\uff0c<strong>KL \u9879\u662f\u901a\u8fc7\u5f53\u524dbatch\u91cc\u9762\u7684\u6b63\u8d1f\u6837\u672c\u8fdb\u884c\u4f30\u8ba1\u5f97\u5230\u7684<\/strong>\u3010<strong>\u53ef\u4ee5\u8ba4\u4e3a\u662fbatch\u6837\u672c\u7684\u5e73\u5747\u6c34\u5e73<\/strong>\u3011\uff0c\u8be6\u7ec6 debug&nbsp;<a rel=\"noreferrer noopener\" href=\"https:\/\/github.com\/huggingface\/trl\/blob\/dcee683d968444179f57bffa5a49a7ec13f57654\/trl\/trainer\/kto_trainer.py#L852\" target=\"_blank\">KTOTrainer \u6e90\u4ee3\u7801<\/a><\/li><\/ul>\n\n\n\n<p><strong>\u5bf9\u6210\u5bf9\u504f\u597d\u6570\u636e\u8fdb\u884c\u5206\u914d<\/strong>\uff1a<\/p>\n\n\n\n<ul><li>\u4e0e\u5927\u591a\u6570\u5bf9\u9f50\u65b9\u6cd5\u4e00\u6837\uff0cDPO \u9700\u8981\u4e00\u4e2a\u6210\u5bf9\u504f\u597d\u6570\u636e\u96c6(x, y_w, y_l)\uff0c\u6807\u6ce8\u8005\u80fd\u591f\u6839\u636e\u4e00\u7ec4\u6807\u51c6\uff08\u5982\u6709\u76ca\u6027\u6216\u6709\u5bb3\u6027\uff09\u6765\u6807\u8bb0\u54ea\u79cd\u6a21\u578b\u54cd\u5e94\u66f4\u597d\u3002<\/li><li><strong>\u5b9e\u8df5\u8fc7\u7a0b\u4e2d\uff0c\u521b\u5efa\u8fd9\u4e9b\u6570\u636e\u662f\u4e00\u9879\u8017\u65f6\u4e14\u6210\u672c\u9ad8\u6602\u7684\u5de5\u4f5c\u3002<\/strong><\/li><li>ContextualAI \u63d0\u51fa\u66ff\u4ee3\u65b9\u6848\uff0c\u79f0\u4e3a Kahneman-Tversky \u4f18\u5316\uff08<code>KTO<\/code>\uff09\uff0c<strong>\u5b8c\u5168\u6839\u636e\u88ab\u6807\u8bb0\u4e3a\u300c\u597d\u300d\u6216\u300c\u574f\u300d\u7684\u6837\u672c\uff08\u4f8b\u5982\u5728\u804a\u5929 UI 
\u4e2d\u770b\u5230\u7684\u56fe\u6807\ud83d\udc4d\u6216\ud83d\udc4e\uff09\u6765\u5b9a\u4e49\u635f\u5931\u51fd\u6570\u3002<\/strong>\u8fd9\u4e9b\u6807\u7b7e\u66f4\u5bb9\u6613\u83b7\u5f97, KTO \u662f\u4e00\u79cd\u5f88\u6709\u524d\u666f\u7684\u65b9\u6cd5\uff0c\u53ef\u7528\u4e8e\u4e0d\u65ad\u66f4\u65b0\u5728\u751f\u4ea7\u73af\u5883\u4e2d\u8fd0\u884c\u7684\u804a\u5929\u6a21\u578b\u3002<\/li><\/ul>\n\n\n\n<p>\u4e0e\u6b64\u540c\u65f6\uff0c\u8fd9\u4e9b\u65b9\u6cd5\u90fd\u6709\u76f8\u5e94\u7684\u8d85\u53c2\u6570\uff0c\u5176\u4e2d\u6700\u91cd\u8981\u7684\u662f \u03b2 \uff0c\u63a7\u5236\u5bf9\u53c2\u8003\u6a21\u578b\u7684\u504f\u597d\u7a0b\u5ea6\u7684\u6743\u91cd\u3002\u8fd9\u4e9b\u65b9\u6cd5\u5df2\u7ecf\u5728\u7b2c\u4e09\u65b9\u5e93\uff08\u5982 huggingface TRL\uff09\u4e2d\u5b9e\u73b0\u3002<\/p>\n\n\n\n<p><strong>KTO \u6570\u636e\u96c6<\/strong>\uff1a<\/p>\n\n\n\n<p><strong>KTO \u4e0d\u9700\u8981\u6210\u5bf9\u7684\u504f\u597d\u6570\u636e\uff0c\u5b9e\u9a8c\u65f6\u76f4\u63a5\u5c06 GPT-4 \u751f\u6210\u7684\u54cd\u5e94\u5f52\u7c7b\u4e3a\u300c\u597d\u300d\u6807\u7b7e\uff0c\u5c06 Llama Chat 13b \u7684\u54cd\u5e94\u89c6\u4e3a\u300c\u574f\u300d\u6807\u7b7e\u3002<\/strong><\/p>\n\n\n\n<p>KTO\u6570\u636e\u96c6\u4e0e\u504f\u597d\u6570\u636e\u96c6\u7c7b\u4f3c\uff0c\u4f46\u4e0d\u540c\u4e8e\u7ed9\u51fa\u4e00\u4e2a\u66f4\u4f18\u7684\u56de\u7b54\u548c\u4e00\u4e2a\u66f4\u5dee\u7684\u56de\u7b54\uff0cKTO\u6570\u636e\u96c6\u5bf9\u6bcf\u4e00\u8f6e\u95ee\u7b54\u53ea\u7ed9\u51fa\u4e00\u4e2a true\/false \u7684&nbsp;<code>label<\/code>\u3002 \u9664\u4e86&nbsp;<code>instruction<\/code>&nbsp;\u4ee5\u53ca&nbsp;<code>input<\/code>&nbsp;\u7ec4\u6210\u7684\u4eba\u7c7b\u6700\u7ec8\u8f93\u5165\u548c\u6a21\u578b\u56de\u7b54&nbsp;<code>output<\/code>&nbsp;\uff0cKTO 
\u6570\u636e\u96c6\u8fd8\u9700\u8981\u989d\u5916\u6dfb\u52a0\u4e00\u4e2a&nbsp;<code>kto_tag<\/code>&nbsp;\u5217\uff08true\/false\uff09\u6765\u8868\u793a\u4eba\u7c7b\u7684\u53cd\u9988\u3002\u5728\u4e00\u8f6e\u95ee\u7b54\u4e2d\u5176\u683c\u5f0f\u5982\u4e0b\uff1a<\/p>\n\n\n\n<pre class=\"wp-block-preformatted\"><strong>[<\/strong>\n  <strong>{<\/strong>\n    <strong>\"instruction\"<\/strong><strong>:<\/strong> \"\u4eba\u7c7b\u6307\u4ee4\uff08\u5fc5\u586b\uff09\"<strong>,<\/strong>\n    <strong>\"input\"<\/strong><strong>:<\/strong> \"\u4eba\u7c7b\u8f93\u5165\uff08\u9009\u586b\uff09\"<strong>,<\/strong>\n    <strong>\"output\"<\/strong><strong>:<\/strong> \"\u6a21\u578b\u56de\u7b54\uff08\u5fc5\u586b\uff09\"<strong>,<\/strong>\n    <strong>\"kto_tag\"<\/strong><strong>:<\/strong> \"\u4eba\u7c7b\u53cd\u9988 [true\/false]\uff08\u5fc5\u586b\uff09\"\n  <strong>}<\/strong>\n<strong>]<\/strong>\n<\/pre>\n\n\n\n<p>\u5bf9\u4e8e\u4e0a\u8ff0\u683c\u5f0f\u7684\u6570\u636e\uff0c&nbsp;<code>dataset_info.json<\/code>&nbsp;\u4e2d\u7684&nbsp;<strong>\u6570\u636e\u96c6\u63cf\u8ff0<\/strong>&nbsp;\u5e94\u4e3a\uff1a<\/p>\n\n\n\n<pre class=\"wp-block-preformatted\"><strong>\"\u6570\u636e\u96c6\u540d\u79f0\"<\/strong><strong>:<\/strong> <strong>{<\/strong>\n  <strong>\"file_name\"<\/strong><strong>:<\/strong> \"data.json\"<strong>,<\/strong>\n  <strong>\"columns\"<\/strong><strong>:<\/strong> <strong>{<\/strong>\n    <strong>\"prompt\"<\/strong><strong>:<\/strong> \"instruction\"<strong>,<\/strong>\n    <strong>\"query\"<\/strong><strong>:<\/strong> \"input\"<strong>,<\/strong>\n    <strong>\"response\"<\/strong><strong>:<\/strong> \"output\"<strong>,<\/strong>\n    <strong>\"kto_tag\"<\/strong><strong>:<\/strong> \"kto_tag\"\n  <strong>}<\/strong>\n<strong>}<\/strong><\/pre>\n\n\n\n<h1>\u4ee3\u7801\u5b9e\u73b0\uff1a<\/h1>\n\n\n\n<p><strong>\u57fa\u4e8epytorch\u3001deepspeed\u3001transformers<\/strong>\uff0c<strong>\u4ee3\u7801\uff1a<\/strong><\/p>\n\n\n\n<ul><li><strong><a rel=\"noreferrer 
noopener\" href=\"https:\/\/github.com\/PKU-Alignment\/align-anything\/tree\/main\/align_anything\/trainers\/text_to_text\" target=\"_blank\">https:\/\/github.com\/PKU-Alignment\/align-anything\/tree\/main\/align_anything\/trainers\/text_to_text<\/a><\/strong><\/li><li><a href=\"https:\/\/github.com\/OpenRLHF\/OpenRLHF\/blob\/main\/README_zh.md\"><strong><em>https:\/\/github.com\/OpenRLHF\/OpenRLHF<\/em><\/strong><\/a><\/li><\/ul>\n\n\n\n<h2>sft\u8bad\u7ec3\u4ee3\u7801\uff1a<\/h2>\n\n\n\n<pre class=\"wp-block-code\"><code>def loss(self, sft_batch: SupervisedBatch) -&gt; dict&#091;str, torch.Tensor]:\n    \"\"\"Loss function for supervised finetuning.\"\"\"\n    outputs = self.<strong>model<\/strong>(**self.infer_batch(sft_batch))\n    return {'loss': outputs.loss}\n\ndef train_step(self, sft_batch: SupervisedBatch) -&gt; dict&#091;str, Any]:\n    \"\"\"Performs a single training step.\"\"\"\n    loss = self.loss(sft_batch)&#091;'loss']\n    self.model.<strong>backward<\/strong>(loss)\n    self.model.<strong>step<\/strong>()\n\n    return {\n        'train\/loss': loss.item(),\n        'train\/lr': self.model.optimizer.param_groups&#091;0]&#091;'lr'],\n    }<\/code><\/pre>\n\n\n\n<h2>dpo\u8bad\u7ec3\u4ee3\u7801\uff1a<\/h2>\n\n\n\n<p><a href=\"https:\/\/blog.csdn.net\/weixin_43013480\/article\/details\/141370269\">https:\/\/blog.csdn.net\/weixin_43013480\/article\/details\/141370269<\/a><\/p>\n\n\n\n<pre class=\"wp-block-code\"><code># \u4ece logits\uff08\u672a\u5f52\u4e00\u5316\u7684\u6982\u7387\u5206\u5e03\uff09\u4e2d\uff0c\u63d0\u53d6 labels \u5bf9\u5e94\u7c7b\u522b\u7684\u5bf9\u6570\u6982\u7387\uff08log probabilities\uff09\u3002\ndef gather_log_probabilities(\n    logits: torch.Tensor,  # size = (B, L, V)\n    labels: torch.LongTensor,  # size = (B, L)\n) -&gt; torch.Tensor:  # size = (B, L)\n    \"\"\"Gather log probabilities of the given labels from the logits.\"\"\"\n    log_probs = F.log_softmax(logits, dim=-1)  # size = (B, L, V)\n    gathered_log_probs = 
torch.gather(  # size = (B, L, 1)\n        log_probs,\n        dim=-1,\n        index=labels.unsqueeze(dim=-1).to(torch.int64),\n    )\n    return gathered_log_probs.squeeze(dim=-1)  # size = (B, L)<\/code><\/pre>\n\n\n\n<pre class=\"wp-block-code\"><code>def compute_log_probs(\n    self,\n    model: AutoModelForCausalLM,\n    batch: PreferenceBatch,\n) -&gt; torch.Tensor:\n    \"\"\"Compute log probabilities of given sequences.\"\"\"\n    # \u83b7\u5f97\u6240\u6709\u53ef\u80fd\u8f93\u51fa\u7684log\u6982\u7387,logits \u8868\u793a\u6bcf\u4e2a token \u4f4d\u7f6e\u7684 \u672a\u5f52\u4e00\u5316\u7684\u6982\u7387\u5206\u5e03\n    logits = <strong>model(**self.infer_batch(batch)).logits<\/strong>\n    device = logits.device\n    input_ids = batch&#091;'input_ids']\n    #\u53d6\u5f97\u6bcf\u4e2a\u6837\u672c\u7684\u56de\u590d\u957f\u5ea6\uff0c\u7528\u4e8e\u622a\u53d6\u6a21\u578b\u8f93\u51fa\n    batch_size = len(batch&#091;'meta_info']&#091;'response_lens'])\n    logprob_list = &#091;]\n    for idx in range(batch_size):\n        response_length = batch&#091;'meta_info']&#091;'response_lens']&#091;idx]\n        # \u53bb\u9664\u586b\u5145 (PAD) token\uff0c\u907f\u514d\u8ba1\u7b97\u65e0\u6548 token \u7684\u6982\u7387\u3002\n        raw_input_id = strip_pad(input_ids&#091;idx], self.tokenizer.pad_token_id)\n        #\u53ea\u4fdd\u7559 \u56de\u590d\u90e8\u5206\u7684 logits\uff0c\u4e22\u5f03 prompt \u90e8\u5206\u3002 \n        logit = logits&#091;idx]&#091;-response_length:].unsqueeze(0)\n        input_id = raw_input_id&#091;-response_length:].unsqueeze(0)\n        #\u8ba1\u7b97\u5bf9\u5e94\u7684better \u548cworse \u5e8f\u5217token \u5bf9\u6570\u6982\u7387\n        log_p = gather_log_probabilities(logit&#091;:, :-1], input_id&#091;:, 1:])\n        logprob_list.append(log_p.squeeze(0))\n    # \u4e0d\u540c\u6837\u672c\u7684 log_probs \u957f\u5ea6\u4e0d\u540c\uff0c\u4f7f\u7528 pad_sequence \u8fdb\u884c padding\uff0c\u8865\u9f50\u5230\u76f8\u540c\u957f\u5ea6\u3002\n    return 
torch.nn.utils.rnn.pad_sequence(\n        logprob_list, batch_first=True, padding_value=0.0\n    ).to(device)\n\ndef loss(  # pylint: disable=too-many-locals\n    self,\n    batch: PreferenceBatch,\n) -&gt; dict&#091;str, torch.Tensor]:\n    \"\"\"Loss function for the DPO algorithm.\"\"\"\n   #\u8ba1\u7b97\u5f53\u524d\u6a21\u578b (self.model.module) \u5728 batch \u4e0a\u7684 log \u6982\u7387\u3002\n    sequence_log_probs = self.compute_log_probs(\n        self.model.module,\n        batch,\n    )\n# better_sequence_log_probs (\u7528\u6237\u504f\u597d\u7684\u56de\u590d)\n# worse_sequence_log_probs (\u7528\u6237\u4e0d\u559c\u6b22\u7684\u56de\u590d)\n    (\n        better_sequence_log_probs,  # size = (B, L - 1)\n        worse_sequence_log_probs,  # size = (B, L - 1)\n    ) = sequence_log_probs.chunk(chunks=2, dim=0)\n# \u8ba1\u7b97\u53c2\u8003\u6a21\u578b (self.reference_model.module) \u7684\u5bf9\u6570\u6982\u7387 (log_probs)\u3002\n# reference_model \u901a\u5e38\u662f \u539f\u59cb\u672a\u4f18\u5316\u7684\u6a21\u578b\uff0c\u4f5c\u4e3a\u5bf9\u6bd4\u57fa\u51c6\u3002\n# torch.no_grad() \u8868\u793a \u4e0d\u8ba1\u7b97\u68af\u5ea6\uff0c\u907f\u514d\u5f71\u54cd\u53c2\u8003\u6a21\u578b\u3002\n    with torch.no_grad():\n        ref_sequence_log_probs = self.compute_log_probs(  # size = (2 * B, L - 1)\n            self.reference_model.module,\n            batch,\n        )\n        ref_better_sequence_log_probs, ref_worse_sequence_log_probs = (\n            ref_sequence_log_probs.chunk(chunks=2, dim=0)\n        )\n\n    losses = &#091;]\n    better_sample_rewards = &#091;]\n    worse_sample_rewards = &#091;]\n\n    batch_size = better_sequence_log_probs.size(0)\n    for i in range(batch_size):\n# \u8ba1\u7b97 \u66f4\u597d\/\u66f4\u5dee\u56de\u590d\u7684\u603b log \u6982\u7387\uff08\u5373\u7d2f\u52a0 token \u7ea7\u522b log \u6982\u7387\uff09\u3002\n        better_log_prob = better_sequence_log_probs&#091;i, :].sum(dim=-1)\n        worse_log_prob = 
worse_sequence_log_probs&#091;i, :].sum(dim=-1)\n        ref_better_log_prob = ref_better_sequence_log_probs&#091;i, :].sum(dim=-1)\n        ref_worse_log_prob = ref_worse_sequence_log_probs&#091;i, :].sum(dim=-1)\n# \u5f53\u524d\u6a21\u578b\u6bd4\u53c2\u8003\u6a21\u578b\u66f4\u504f\u597d better \u56de\u590d \u7684\u7a0b\u5ea6\u3002\n        better_log_ratio = better_log_prob - ref_better_log_prob\n# \u5f53\u524d\u6a21\u578b\u6bd4\u53c2\u8003\u6a21\u578b\u66f4\u504f\u597d worse \u56de\u590d \u7684\u7a0b\u5ea6\u3002\n        worse_log_ratio = worse_log_prob - ref_worse_log_prob\n# \u8ba1\u7b97 better \u548c worse \u7684 log \u6bd4\u503c\u5dee\n# \u4f7f\u7528 -logsigmoid(x) \u8ba1\u7b97\u8d1f\u5bf9\u6570 sigmoid \u635f\u5931\uff0c\u4f18\u5316\u6a21\u578b\u4f7f\u5176\u66f4\u503e\u5411 better \u56de\u590d\u3002\n# logsigmoid \u7684\u6027\u8d28\uff1a\n# \u5982\u679c x \u5f88\u5927\uff0clogsigmoid(x) \u2248 0\uff0c\u610f\u5473\u7740\u635f\u5931\u5c0f\uff0c\u6a21\u578b\u5df2\u7ecf\u6b63\u786e\u504f\u597d better response\u3002\n# \u5982\u679c x \u5f88\u5c0f\u6216\u8d1f\uff0clogsigmoid(x) \u2248 x\uff0c\u610f\u5473\u7740\u635f\u5931\u5927\uff0c\u6a21\u578b\u6ca1\u6709\u6b63\u786e\u533a\u5206 better \u548c worse\uff0c\u9700\u8981\u4f18\u5316\u3002\n        losses.append(\n            -F.logsigmoid(\n                self.cfgs.train_cfgs.scale_coeff * (better_log_ratio - worse_log_ratio),\n            ),\n        )\n        better_sample_rewards.append(\n            self.cfgs.train_cfgs.scale_coeff * better_log_ratio.detach(),\n        )\n        worse_sample_rewards.append(self.cfgs.train_cfgs.scale_coeff * worse_log_ratio.detach())\n    loss = torch.stack(losses).mean()  # size = ()\n    better_sample_reward = torch.stack(better_sample_rewards)  # size = (B,)\n    worse_sample_reward = torch.stack(worse_sample_rewards)  # size = (B,)\n# \u8ba1\u7b97 \u5956\u52b1 (reward)\u3001\u51c6\u786e\u7387 (accuracy) \u548c\u5956\u52b1\u95f4\u8ddd (margin)\u3002\n    reward = 
better_sample_reward + worse_sample_reward  # size = (B,)\n    reward_accuracy = (better_sample_reward &gt; worse_sample_reward).float().mean()  # size = ()\n    reward_margin = better_sample_reward - worse_sample_reward  # size = (B,)\n\n    return {\n        'loss': loss,\n        'reward': reward,\n        'better_sample_reward': better_sample_reward,\n        'worse_sample_reward': worse_sample_reward,\n        'reward_accuracy': reward_accuracy,\n        'reward_margin': reward_margin,\n    }\n\ndef train_step(\n    self,\n    batch: PreferenceBatch,\n) -&gt; dict&#091;str, Any]:\n    \"\"\"Perform a single training step for DPO.\"\"\"\n    loss_dict = self.loss(batch=batch)\n    loss = loss_dict&#091;'loss']\n    self.model.backward(loss)\n    self.model.step()\n\n    with torch.no_grad():\n        reward = loss_dict&#091;'reward'].mean()\n        better_sample_reward = loss_dict&#091;'better_sample_reward'].mean()\n        worse_sample_reward = loss_dict&#091;'worse_sample_reward'].mean()\n        reward_accuracy = loss_dict&#091;'reward_accuracy']\n        reward_margin = loss_dict&#091;'reward_margin'].mean()\n\n        loss = get_all_reduce_mean(loss)\n        reward = get_all_reduce_mean(reward)\n        better_sample_reward = get_all_reduce_mean(better_sample_reward)\n        worse_sample_reward = get_all_reduce_mean(worse_sample_reward)\n        reward_accuracy = get_all_reduce_mean(reward_accuracy)\n        reward_margin = get_all_reduce_mean(reward_margin)\n\n    return {\n        'train\/loss': loss.item(),\n        'train\/reward': reward.item(),\n        'train\/better_sample_reward': better_sample_reward.item(),\n        'train\/worse_sample_reward': worse_sample_reward.item(),\n        'train\/reward_accuracy': reward_accuracy.item(),\n        'train\/reward_margin': reward_margin.item(),\n        'train\/lr': self.model.optimizer.param_groups&#091;0]&#091;'lr'],\n    
}<\/code><\/pre>\n\n\n\n<h2>PPO\u8bad\u7ec3\u4ee3\u7801\uff1a<\/h2>\n\n\n\n<pre class=\"wp-block-preformatted\">#\u4f7f\u7528\u7b56\u7565\u6a21\u578b (Actor Model) \u751f\u6210\u6587\u672c\uff0c\u5e76\u8fd4\u56de\u5176 input_ids \u548c attention_mask\u3002\ndef actor_step(self, mini_prompt_only_batch: PromptOnlyBatch) -&gt; dict[str, Any]:\n    infer_batch = self.infer_batch(mini_prompt_only_batch)\n    actor_batch = copy.deepcopy(infer_batch)\n    sequences = self.actor_model.module.generate(\n        **infer_batch,\n        generation_config=self.generation_config,\n        synced_gpus=True,\n        do_sample=True,\n    )\n    attention_mask = sequences.not_equal(self.tokenizer.pad_token_id)\n    actor_batch['input_ids'] = sequences\n    actor_batch['attention_mask'] = attention_mask\n\n    return actor_batch<\/pre>\n\n\n\n<pre class=\"wp-block-code\"><code># \u8ba1\u7b97\u5956\u52b1\u503c (reward) \u548c critic \u4ef7\u503c\u4f30\u8ba1 (reward_values)\u3002 \ndef reward_model_step(self, actor_batch: PromptOnlyBatch) -&gt; dict&#091;str, Any]:\n        reward_batch = copy.deepcopy(actor_batch)\n        if self.reward_tokenizer is not self.tokenizer:\n            reward_tokenize_output = batch_retokenize(\n                actor_batch&#091;'input_ids'],\n                src_tokenizer=self.tokenizer,\n                dest_tokenizer=self.reward_tokenizer,\n                skip_special_tokens=True,\n                device=self.args.device,\n            )\n            reward_batch&#091;'input_ids'] = reward_tokenize_output&#091;'input_ids']\n            reward_batch&#091;'attention_mask'] = reward_tokenize_output&#091;'attention_mask']\n        reward_infer_batch = self.reward_infer_batch(reward_batch)\n        reward_batch&#091;'reward'] = self.reward_model(**reward_infer_batch).end_scores.squeeze(dim=-1)\n        critic_infer_batch = self.reward_infer_batch(actor_batch)\n        scores = self.reward_critic_model(**critic_infer_batch).scores\n        
reward_batch&#091;'reward_values'] = scores.squeeze(dim=-1)&#091;:, :-1]\n\n        return reward_batch<\/code><\/pre>\n\n\n\n<pre class=\"wp-block-code\"><code>#\u51bb\u7ed3\u6a21\u578b\u53c2\u6570\uff0c\u907f\u514d\u5f71\u54cd\u8bad\u7ec3\uff0c\u91c7\u6837\u591a\u4e2a mini-batch\uff0c\u751f\u6210\u6587\u672c\uff0c\u8ba1\u7b97\u5956\u52b1\uff0c\u8ba1\u7b97 log \u6982\u7387 (log_probs)\uff0c\u8ba1\u7b97\u53c2\u8003\u6a21\u578b\u7684 log \u6982\u7387 (ref_log_probs)\n# \u7ecf\u9a8c\u56de\u653e\uff1a\u751f\u6210\u8bad\u7ec3\u6570\u636e\u5e76\u8ba1\u7b97\u6307\u6807\n  @torch.no_grad()\n    def rollout(self, prompt_only_batch: PromptOnlyBatch) -&gt; list&#091;dict&#091;str, Any]]:\n        \"\"\"Rollout a batch of experiences.\"\"\"\n        # freeze the model for rolling out\n        self.set_train(mode=False)\n\n        total_batch_size = prompt_only_batch&#091;'input_ids'].size(0)\n        micro_batch_size = int(self.cfgs.train_cfgs.per_device_train_batch_size)\n        micro_inference_batches = &#091;]\n        micro_training_batches = &#091;]\n        mini_batch = {}\n        for i in range(0, total_batch_size, micro_batch_size):\n\n            mini_batch = {\n                key: prompt_only_batch&#091;key]&#091;i : i + micro_batch_size] for key in prompt_only_batch\n            }\n\n            # actor generation\n            actor_batch = self.actor_step(mini_batch)\n            # reward model and reward critic model scoring\n            reward_batch = self.reward_model_step(actor_batch)\n            # calculate the log probabilities\n            logits = self.actor_model(**actor_batch).logits\n            ref_logits = self.actor_reference_model(**actor_batch).logits\n            log_probs = gather_log_probabilities(logits&#091;:, :-1], actor_batch&#091;'input_ids']&#091;:, 1:])\n            ref_log_probs = gather_log_probabilities(\n                ref_logits&#091;:, :-1], actor_batch&#091;'input_ids']&#091;:, 1:]\n            )\n\n            
micro_training_batch = {}\n            micro_training_batch&#091;'prompt_idx'] = mini_batch&#091;'input_ids'].size(-1) - 1\n            micro_training_batch&#091;'log_probs'] = log_probs\n            micro_training_batch&#091;'ref_log_probs'] = ref_log_probs\n            micro_training_batch&#091;'reward'] = reward_batch&#091;'reward']\n            micro_training_batch&#091;'reward_values'] = reward_batch&#091;'reward_values']\n\n            mini_batch&#091;'input_ids'] = reward_batch&#091;'input_ids']\n            mini_batch&#091;'attention_mask'] = actor_batch&#091;'attention_mask']\n            # add rollout results to the batches\n            micro_inference_batches.append(mini_batch)\n            micro_training_batches.append(micro_training_batch)\n\n        # unfreeze the model for training\n        self.set_train()\n\n        return micro_inference_batches, micro_training_batches\n<\/code><\/pre>\n\n\n\n<pre class=\"wp-block-code\"><code>\n#\u8ba1\u7b97\u7b56\u7565\u68af\u5ea6\u635f\u5931\n# \u8ba1\u7b97 PPO \u635f\u5931\u51fd\u6570\uff1a\n# ratios = exp(new_log_probs - old_log_probs)\uff08\u65b0\u65e7\u7b56\u7565\u6bd4\uff09\u3002\n# \u88c1\u526a ratios \u907f\u514d\u7b56\u7565\u5267\u70c8\u53d8\u5316\uff08PPO \u5173\u952e\uff09\u3002\n# return -masked_mean(surrogate, mask)\uff1a\u6700\u5927\u5316\u4f18\u52bf \ud835\udc34\ud835\udc61\n   \ndef actor_loss_fn(\n        self,\n        log_probs: torch.Tensor,  # size = (B, L - S)\n        old_log_probs: torch.Tensor,  # size = (B, L - S)\n        advantages: torch.Tensor,  # size = (B, L - S)\n        mask: torch.BoolTensor,  # size = (B, L - S)\n    ) -&gt; torch.Tensor:  # size = ()\n        # size = (B, L - S)\n        ratios = torch.exp(log_probs - old_log_probs)\n        surrogate1 = advantages * ratios\n        surrogate2 = advantages * torch.clamp(\n            ratios,\n            1.0 - self.clip_range_ratio,\n            1.0 + self.clip_range_ratio,\n        )\n        surrogate = 
torch.minimum(surrogate1, surrogate2)\n        return -masked_mean(surrogate, mask)  # size = ()\n<\/code><\/pre>\n\n\n\n<pre class=\"wp-block-code\"><code>#  rl_step\u51fd\u6570\u662f\u8bad\u7ec3\u8fc7\u7a0b\u4e2d\u4f7f\u7528\u5f3a\u5316\u5b66\u4e60\uff08RL\uff09\u66f4\u65b0\u7b56\u7565\u7684\u4e00\u6b65\u3002\u5728PPO\u7b97\u6cd5\u4e2d\uff0crl_step\u662f\u7528\u6765\u66f4\u65b0\u7b56\u7565\u7f51\u7edc\uff08actor\uff09\u548c\u4ef7\u503c\u7f51\u7edc\uff08critic\uff09\u7684\u4e00\u90e8\u5206\u3002\u5177\u4f53\u6765\u8bf4\uff0c\u8fd9\u4e2a\u51fd\u6570\u901a\u8fc7\u8ba1\u7b97\u5f3a\u5316\u5b66\u4e60\u635f\u5931\uff08actor loss\u548ccritic loss\uff09\uff0c\u5e76\u901a\u8fc7\u53cd\u5411\u4f20\u64ad\u4f18\u5316\u8fd9\u4e24\u4e2a\u7f51\u7edc\u3002\n# reward_critic_model \u8bc4\u4f30\u5956\u52b1\u51fd\u6570\u7684 \u4ef7\u503c\u4f30\u8ba1\uff0c\u7528\u4e8e\u8ba1\u7b97 \u4f18\u52bf\u51fd\u6570 \ud835\udc34\ud835\udc61\u3002\u5b83\u4e0d\u662f\u76f4\u63a5\u8ba1\u7b97\u5956\u52b1\uff0c\u800c\u662f\u4f30\u7b97\u672a\u6765\u53ef\u80fd\u83b7\u5f97\u7684\u5956\u52b1\u3002\u4e3b\u8981\u7528\u4e8e\u65f6\u95f4\u5dee\u5206\uff08TD learning\uff09\u66f4\u65b0\u7b56\u7565\uff0c\u7c7b\u4f3c\u4e8e \u4ef7\u503c\u51fd\u6570\u3002\n\ndef rl_step(\n        self, inference_batch: dict&#091;str, torch.Tensor], training_batch: dict&#091;str, torch.Tensor]\n    ) -&gt; dict&#091;str, Any]:\n        \"\"\"Perform a single update step with RL loss.\"\"\"\n        old_log_probs = training_batch&#091;'log_probs']\n        ref_log_probs = training_batch&#091;'ref_log_probs']\n        reward = training_batch&#091;'reward']\n        old_reward_values = training_batch&#091;'reward_values']\n        start = training_batch&#091;'prompt_idx']\n\n        input_ids = inference_batch&#091;'input_ids']\n        attention_mask = inference_batch&#091;'attention_mask']\n\n        sequence_mask = attention_mask&#091;:, 1:]\n\n        with torch.no_grad():\n            old_rewards = self.add_kl_divergence_regularization(\n        
        reward,\n                old_log_probs,\n                ref_log_probs,\n                sequence_mask,\n            )\n            reward_advantages, reward_returns = self.get_advantages_and_returns(\n                old_reward_values,\n                old_rewards,\n                sequence_mask,\n                start,\n            )\n\n        logits = self.actor_model(**inference_batch, use_cache=False).logits\n        log_probs = gather_log_probabilities(logits&#091;:, :-1], input_ids&#091;:, 1:])\n        actor_loss = self.actor_loss_fn(\n            log_probs&#091;:, start:],\n            old_log_probs&#091;:, start:],\n            reward_advantages,\n            sequence_mask&#091;:, start:],\n        )\n        self.actor_model.backward(actor_loss)\n        self.actor_model.step()\n\n        reward_values = self.reward_critic_model(**inference_batch).scores\n        reward_values = reward_values.squeeze(dim=-1)&#091;:, :-1]\n        reward_critic_loss = self.critic_loss_fn(\n            reward_values&#091;:, start:],\n            old_reward_values&#091;:, start:],\n            reward_returns,\n            sequence_mask&#091;:, start:],\n        )\n        self.reward_critic_model.backward(reward_critic_loss)\n        self.reward_critic_model.step()\n\n        with torch.no_grad():\n            mask = sequence_mask&#091;:, start:]\n            kl_divergence = ((old_log_probs - ref_log_probs)&#091;:, start:] * mask).sum(dim=-1).mean()\n            mean_generated_length = mask.sum(dim=-1).float().mean()\n            max_generated_length = mask.sum(dim=-1).float().max()\n\n            reward = reward.mean()\n            reward_with_kl_penalty = (old_rewards&#091;:, start:] * mask).sum(dim=-1).mean()\n            reward_advantage = masked_mean(reward_advantages, mask)\n            reward_return = masked_mean(reward_returns, mask)\n            reward_value = masked_mean(reward_values&#091;:, start:], mask)\n\n            actor_loss = 
get_all_reduce_mean(actor_loss)\n            reward_critic_loss = get_all_reduce_mean(reward_critic_loss)\n            reward = get_all_reduce_mean(reward)\n            reward_with_kl_penalty = get_all_reduce_mean(reward_with_kl_penalty)\n            reward_advantage = get_all_reduce_mean(reward_advantage)\n            reward_return = get_all_reduce_mean(reward_return)\n            reward_value = get_all_reduce_mean(reward_value)\n            kl_divergence = get_all_reduce_mean(kl_divergence)\n            mean_generated_length = get_all_reduce_mean(mean_generated_length)\n            max_generated_length = get_all_reduce_max(max_generated_length)\n\n        dist.barrier()\n\n        return {\n            'train\/actor_loss': actor_loss.item(),\n            'train\/reward_critic_loss': reward_critic_loss.item(),\n            'train\/reward': reward.item(),\n            'train\/reward_with_kl_penalty': reward_with_kl_penalty.item(),\n            'train\/reward_advantage': reward_advantage.item(),\n            'train\/reward_return': reward_return.item(),\n            'train\/reward_value': reward_value.item(),\n            'train\/kl_divergence': kl_divergence.item(),\n            'train\/actor_lr': self.actor_model.optimizer.param_groups&#091;0]&#091;'lr'],\n            'train\/reward_critic_lr': self.reward_critic_model.optimizer.param_groups&#091;0]&#091;'lr'],\n            'train\/mean_generated_length': mean_generated_length.item(),\n            'train\/max_generated_length': max_generated_length.item(),\n        }\n\n    def ptx_step(self, ptx_batch: dict&#091;str, torch.Tensor]) -&gt; dict&#091;str, Any]:\n        \"\"\"Perform a single update step with PTX loss.\"\"\"\n        ptx_loss = self.actor_model(**self.infer_batch(ptx_batch)).loss\n        self.actor_model.backward(self.ptx_coeff * ptx_loss)\n        self.actor_model.step()\n        ptx_loss = get_all_reduce_mean(ptx_loss)\n        return {\n            'train\/ptx_loss': ptx_loss.item(),\n        
}<\/code><\/pre>\n\n\n\n<pre class=\"wp-block-code\"><code>\n    def train(self) -&gt; None:\n        \"\"\"Train the model.\"\"\"\n        self.logger.print('***** Running training *****')\n\n        progress_bar = tqdm(\n            total=self.total_training_steps,\n            desc=f'Training 1\/{self.cfgs.train_cfgs.epochs} epoch',\n            position=0,\n            leave=True,\n            disable=not is_main_process(),\n        )\n\n        if self.cfgs.data_cfgs.eval_datasets:\n            self.logger.print('\\n***** Evaluating at the beginning *****')\n            self.eval()\n\n        num_prompt_only_batches = len(self.prompt_only_dataloader)\n        num_ptx_batches = len(self.ptx_dataloader)\n        num_ptx_replicas = (num_prompt_only_batches + num_ptx_batches - 1) \/\/ num_ptx_batches\n        for epoch in range(int(self.cfgs.train_cfgs.epochs)):\n            for prompt_only_batch, ptx_batch in zip(\n                self.prompt_only_dataloader,\n                itertools.chain.from_iterable(&#091;self.ptx_dataloader] * num_ptx_replicas),\n            ):\n                inference_batches, training_batches = self.rollout(prompt_only_batch)\n\n                if self.use_ptx:\n                    ptx_batches = self.split_ptx_micro_batches(ptx_batch)\n                else:\n                    ptx_batches = &#091;None for _ in range(len(inference_batches))]\n                torch.cuda.empty_cache()\n\n                for _ in range(self.cfgs.train_cfgs.update_iters):\n                    for inference_batch, training_batch, ptx_batch in zip(\n                        inference_batches, training_batches, ptx_batches\n                    ):\n                        rl_info = self.rl_step(inference_batch, training_batch)\n\n                        torch.cuda.empty_cache()\n                        self.logger.log(rl_info, step=self.global_step)\n                        if self.use_ptx:\n                            ptx_info = self.ptx_step(ptx_batch)\n      
                      torch.cuda.empty_cache()\n                            self.logger.log(ptx_info, step=self.global_step)\n\n                        self.global_step += 1\n                        progress_bar.set_description(\n                            f'Training {epoch + 1}\/{self.cfgs.train_cfgs.epochs} epoch '\n                            f'(reward {rl_info&#091;\"train\/reward\"]:.4f})',\n                        )\n                        progress_bar.update(1)\n\n                        if self.global_step % self.cfgs.logger_cfgs.save_interval == 0:\n                            self.logger.print(f'Saving checkpoint at step {self.global_step} ...')\n                            self.save(tag=self.global_step)\n                            self.logger.print('Checkpoint saved.')\n\n                        if (\n                            self.cfgs.data_cfgs.eval_datasets\n                            and self.cfgs.train_cfgs.eval_strategy == 'steps'\n                            and self.global_step % self.cfgs.train_cfgs.eval_interval == 0\n                        ):\n                            self.logger.print(\n                                f'\\n***** Evaluating at step {self.global_step} *****',\n                            )\n                            self.eval()<\/code><\/pre>\n\n\n\n<h2>RM\u5956\u52b1\u6a21\u578b\u8bad\u7ec3\u4ee3\u7801\uff1a<\/h2>\n\n\n\n<pre class=\"wp-block-code\"><code>    def loss(\n        self,\n        batch: PreferenceBatch,\n    ) -&gt; dict&#091;str, torch.Tensor]:\n        \"\"\"Loss function for the reward model.\"\"\"\n        (\n            better_input_ids,  # size = (B, L)\n            worse_input_ids,  # size = (B, L)\n        ) = batch&#091;\n            'input_ids'\n        ].chunk(chunks=2, dim=0)\n        assert better_input_ids.size(0) == worse_input_ids.size(0), 'batch size mismatch!'\n\n# 
scores\uff1a\u4e00\u822c\u6765\u8bf4\uff0c\u8fd9\u4ee3\u8868\u6a21\u578b\u5728\u6bcf\u4e2a\u65f6\u95f4\u6b65\u9aa4\uff08\u6216\u8f93\u5165\u5206\u6bb5\uff09\u4e0a\u7684\u5956\u52b1\u5f97\u5206\uff0c\u901a\u5e38\u662f\u4e00\u4e2a\u5f62\u72b6\u4e3a (B, L, 1) \u7684\u5f20\u91cf\uff0c\u5176\u4e2d B \u662f\u6279\u91cf\u5927\u5c0f\uff0cL \u662f\u8f93\u5165\u5e8f\u5217\u7684\u957f\u5ea6\uff0c1 \u662f\u5956\u52b1\u5f97\u5206\u7684\u7ef4\u5ea6\u3002\n#end_scores\uff1a\u901a\u5e38\u8868\u793a\u8f93\u5165\u5e8f\u5217\u7684\u7ed3\u675f\u9636\u6bb5\u7684\u5956\u52b1\u5f97\u5206\uff0c\u8fd9\u53ef\u80fd\u662f\u5728\u6574\u4e2a\u5e8f\u5217\u5904\u7406\u5b8c\u6210\u540e\uff0c\u6a21\u578b\u8ba1\u7b97\u51fa\u7684\u6700\u7ec8\u5956\u52b1\u3002\n        output = self.model(**self.infer_batch(batch))\n        scores = output.scores\n        end_scores = output.end_scores\n        higher_rewards, lower_rewards = scores.squeeze(dim=-1).chunk(chunks=2, dim=0)\n        higher_end_reward, lower_end_reward = end_scores.squeeze(dim=-1).chunk(chunks=2, dim=0)\n\n        loss = -F.logsigmoid(higher_end_reward - lower_end_reward).mean()\n\n        if self.cfgs.train_cfgs.regularization &gt; 0.0:\n            loss = (\n                loss\n                + self.cfgs.train_cfgs.regularization\n                * torch.stack(&#091;lower_end_reward, higher_end_reward]).square().mean()\n            )\n\n        accuracy = (higher_end_reward &gt; lower_end_reward).float().mean()  # size = ()\n        return {\n            'loss': loss,  # size = ()\n            'higher_end_reward': higher_end_reward,  # size = (B,)\n            'lower_end_reward': lower_end_reward,  # size = (B,)\n            'higher_rewards': higher_rewards,  # size = (B, L)\n            'lower_rewards': lower_rewards,  # size = (B, L)\n            'accuracy': accuracy,  # size = ()\n        }\n\n    def train_step(\n        self,\n        batch: PreferenceBatch,\n    ) -&gt; dict&#091;str, Any]:\n        \"\"\"Perform a single 
training step.\"\"\"\n        loss_dict = self.loss(batch)\n        loss = loss_dict&#091;'loss']\n        self.model.backward(loss)\n        self.model.step()\n\n        accuracy = loss_dict&#091;'accuracy']\n\n        loss = get_all_reduce_mean(loss)\n        accuracy = get_all_reduce_mean(accuracy)\n\n        return {\n            'train\/loss': loss.item(),\n            'train\/accuracy': accuracy.item(),\n            'train\/lr': self.model.optimizer.param_groups&#091;0]&#091;'lr'],\n        }<\/code><\/pre>\n\n\n\n<h2>ORPO\u8bad\u7ec3\u4ee3\u7801\uff1a<\/h2>\n\n\n\n<p>\u76f8\u5173\u4ecb\u7ecd\uff1a<a href=\"https:\/\/github.com\/Paul33333\/ORPO\"><em>https:\/\/github.com\/Paul33333\/ORPO<\/em><\/a>\uff1b<a href=\"https:\/\/zhuanlan.zhihu.com\/p\/688583797\">https:\/\/zhuanlan.zhihu.com\/p\/688583797<\/a><\/p>\n\n\n\n<pre class=\"wp-block-code\"><code># \u4ece logits\uff08\u672a\u5f52\u4e00\u5316\u7684\u6982\u7387\u5206\u5e03\uff09\u4e2d\uff0c\u63d0\u53d6 labels \u5bf9\u5e94\u7c7b\u522b\u7684\u5bf9\u6570\u6982\u7387\uff08log probabilities\uff09\u3002\ndef gather_log_probabilities(\n    logits: torch.Tensor,  # size = (B, L, V)\n    labels: torch.LongTensor,  # size = (B, L)\n) -&gt; torch.Tensor:  # size = (B, L)\n    \"\"\"Gather log probabilities of the given labels from the logits.\"\"\"\n    log_probs = F.log_softmax(logits, dim=-1)  # size = (B, L, V)\n    gathered_log_probs = torch.gather(  # size = (B, L, 1)\n        log_probs,\n        dim=-1,\n        index=labels.unsqueeze(dim=-1).to(torch.int64),\n    )\n    return gathered_log_probs.squeeze(dim=-1)  # size = (B, L)\n\n# compute_log_probs \u7684\u4f5c\u7528\u662f\u8ba1\u7b97<strong>\u7ed9\u5b9a\u5e8f\u5217\u7684 log \u6982\u7387<\/strong>\uff08\u5bf9\u6570\u6982\u7387\uff09\uff0c\u4e3b\u8981\u7528\u4e8e\u8bc4\u4f30\u8bed\u8a00\u6a21\u578b\uff08LLM\uff09\u7684\u751f\u6210\u8d28\u91cf\u3002\ndef compute_log_probs(\n        self,\n        model: AutoModelForCausalLM,\n        batch: PreferenceBatch,\n    ) -&gt; torch.Tensor:\n        \"\"\"Compute log 
probabilities of given sequences.\"\"\"\n        logits = model(**self.infer_batch(batch)).logits\n        device = logits.device\n        input_ids = batch&#091;'input_ids']\n        batch_size = len(batch&#091;'meta_info']&#091;'response_lens'])\n        logprob_list = &#091;]\n        for idx in range(batch_size):\n            response_length = batch&#091;'meta_info']&#091;'response_lens']&#091;idx]  # for the eos token\n            logit = logits&#091;idx]&#091;-response_length:].unsqueeze(0)\n            input_id = input_ids&#091;idx]&#091;-response_length:].unsqueeze(0)\n# logit&#091;:, :-1]\u53d6 response \u90e8\u5206\u7684 logits\uff0c\u53bb\u6389\u6700\u540e\u4e00\u4e2a token\uff08\u56e0\u4e3a logits \u9884\u6d4b\u7684\u662f\u4e0b\u4e00\u4e2a token\uff09\uff1binput_id&#091;:, 1:]: \u53d6 response \u90e8\u5206\u7684 token IDs\uff0c\u4ece\u7b2c\u4e8c\u4e2a token \u5f00\u59cb\uff08\u56e0\u4e3a log_probs \u8ba1\u7b97\u7684\u662f\u4e0b\u4e00\u4e2a token \u6982\u7387\uff09\u3002\n# \u4f5c\u7528\uff1a\u8ba1\u7b97 response \u90e8\u5206\u6bcf\u4e2a token \u7684 log \u6982\u7387\uff08\u5bf9 logit \u7684 softmax \u53d6\u5bf9\u6570\uff09\u3002\n            log_p = gather_log_probabilities(logit&#091;:, :-1], input_id&#091;:, 1:]) \n            logprob_list.append(log_p.squeeze(0))\n#<strong>pad\u586b\u5145\uff0c\u8fd4\u56de\u5f20\u91cf\u5f62\u72b6 (B, max_L_resp)<\/strong>\n        return torch.nn.utils.rnn.pad_sequence(\n            logprob_list, batch_first=True, padding_value=0.0\n        ).to(device)\n<\/code><\/pre>\n\n\n\n<pre class=\"wp-block-code\"><code>class ORPOTrainer(DPOTrainer):\n\n    def loss(  # pylint: disable=too-many-locals\n        self,\n        batch: PreferenceBatch, # size = (2*B, L)\n    ) -&gt; dict&#091;str, torch.Tensor]:\n        \"\"\"Loss function for the ORPO algorithm.\"\"\"\n        sequence_log_probs = self.compute_log_probs(\n            self.model.module,\n            batch,\n        )\n        (\n            better_sequence_log_probs,  # 
size = (B, L - 1)\n            worse_sequence_log_probs,  # size = (B, L - 1)\n        ) = sequence_log_probs.chunk(chunks=2, dim=0)\n\n        losses = &#091;]\n        better_sample_rewards = &#091;]\n        worse_sample_rewards = &#091;]\n\n        better_input_ids, worse_input_ids = batch&#091;'input_ids'].chunk(chunks=2, dim=0)\n        better_attention_mask, worse_attention_mask = batch&#091;'attention_mask'].chunk(chunks=2, dim=0)\n\n        batch_size = better_input_ids.size(0)\n#diverge_index \u4ee3\u8868 better \u548c worse \u8f93\u5165\u5e8f\u5217\u5f00\u59cb\u4e0d\u540c\u7684\u4f4d\u7f6e\uff1adiverge_index\uff0c\u5373\u5b83\u4e4b\u540e\u7684 token \u662f\u6a21\u578b\u751f\u6210\u7684\u90e8\u5206\u3002\n        for i in range(batch_size):\n            if torch.all(torch.eq(better_input_ids&#091;i], worse_input_ids&#091;i])).item():\n                continue\n            better_end_index = better_attention_mask&#091;i].nonzero()&#091;-1].squeeze().item()\n            worse_end_index = worse_attention_mask&#091;i].nonzero()&#091;-1].squeeze().item()\n            diverge_index = (\n                (better_input_ids&#091;i] != worse_input_ids&#091;i]).nonzero()&#091;0].squeeze().item()\n            )\n            assert 0 &lt;= diverge_index &lt;= better_end_index, 'diverge index is out of range!'\n            assert 0 &lt;= diverge_index &lt;= worse_end_index, 'diverge index is out of range!'\n# better_seq_slice \u548c worse_seq_slice \u53d6\u4ece diverge_index \u5f00\u59cb\u5230\u5e8f\u5217\u7ed3\u675f\u7684\u90e8\u5206\uff08\u5373\u6a21\u578b\u751f\u6210\u7684 token\uff09\u3002\n            better_seq_slice = slice(diverge_index, better_end_index + 1)\n            worse_seq_slice = slice(diverge_index, worse_end_index + 1)\n            better_seq_length = better_end_index + 1\n            worse_seq_length = worse_end_index + 1\n\n            # size = ()\n# better_log_prob: \u8ba1\u7b97 better \u90e8\u5206\u7684\u603b log \u6982\u7387\u3002\n# 
worse_log_prob: \u8ba1\u7b97 worse \u90e8\u5206\u7684\u603b log \u6982\u7387\u3002\n# \u8ba1\u7b97 \u5bf9\u6570\u6bd4\u7387\uff08log ratio\uff09:\n            better_log_prob = better_sequence_log_probs&#091;i, better_seq_slice].sum(dim=-1)\n            worse_log_prob = worse_sequence_log_probs&#091;i, worse_seq_slice].sum(dim=-1)\n            better_log_ratio = better_log_prob \/ better_seq_length\n            worse_log_ratio = worse_log_prob \/ worse_seq_length\n# \u8ba1\u7b97 ORPO \u7684 odds ratio loss\uff1a\n            log_odds = (better_log_ratio - worse_log_ratio) - (\n                torch.log1p(-torch.exp(better_log_ratio)) - torch.log1p(-torch.exp(worse_log_ratio))\n            )\n#  better \u7684 log \u6982\u7387\u660e\u663e\u9ad8\u4e8e worse\uff0c\u4ece\u800c\u4f18\u5316\u751f\u6210\u7b56\u7565\u3002\n            odds_ratio_loss = -F.logsigmoid(log_odds)\n# \u6700\u7ec8\u635f\u5931\n            sft_loss = -better_log_ratio\n            losses.append(\n                sft_loss + self.cfgs.train_cfgs.scale_coeff * odds_ratio_loss,\n            )\n            better_sample_rewards.append(\n                self.cfgs.train_cfgs.scale_coeff * better_log_ratio.detach(),\n            )\n            worse_sample_rewards.append(self.cfgs.train_cfgs.scale_coeff * worse_log_ratio.detach())\n\n        loss = torch.stack(losses).mean()  # size = ()\n        better_sample_reward = torch.stack(better_sample_rewards)  # size = (B,)\n        worse_sample_reward = torch.stack(worse_sample_rewards)  # size = (B,)\n        reward = better_sample_reward + worse_sample_reward  # size = (B,)\n        reward_accuracy = (better_sample_reward &gt; worse_sample_reward).float().mean()  # size = ()\n        reward_margin = better_sample_reward - worse_sample_reward  # size = (B,)\n\n        return {\n            'loss': loss,\n            'reward': reward,\n            'better_sample_reward': better_sample_reward,\n            'worse_sample_reward': worse_sample_reward,\n           
 'reward_accuracy': reward_accuracy,\n            'reward_margin': reward_margin,\n        }\n\n\ndef main():\n    # setup distribution training\n    deepspeed.init_distributed()\n    current_device = get_current_device()\n    torch.cuda.set_device(current_device)\n\n    # read default configs from the yaml file\n    task = os.path.join('text_to_text', 'orpo')\n    dict_cfgs, ds_cfgs = read_cfgs(mode='train', task=task)\n\n    # get custom configs from command line\n    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)\n    _, unparsed_args = parser.parse_known_args()\n    keys = &#091;k&#091;2:] for k in unparsed_args&#091;1::2]]\n    values = list(unparsed_args&#091;2::2])\n    unparsed_args = dict(zip(keys, values))\n    for k, v in unparsed_args.items():\n        dict_cfgs = update_dict(dict_cfgs, custom_cfgs_to_dict(k, v))\n\n    # setup training\n    cfgs = dict_to_namedtuple(dict_cfgs)\n    seed_everything(cfgs.train_cfgs.seed)\n\n    # finetune the model\n    trainer = ORPOTrainer(cfgs=cfgs, ds_cfgs=ds_cfgs)\n    trainer.train()\n    trainer.save()<\/code><\/pre>\n\n\n\n<h2>SimPO\u8bad\u7ec3\u4ee3\u7801\uff1a<\/h2>\n\n\n\n<p> <a href=\"https:\/\/blog.csdn.net\/weixin_43013480\/article\/details\/141370269\">https:\/\/blog.csdn.net\/weixin_43013480\/article\/details\/141370269<\/a> <\/p>\n\n\n\n<pre class=\"wp-block-code\"><code># compute_log_probs \u7684\u4f5c\u7528\u662f\u8ba1\u7b97<strong>\u7ed9\u5b9a\u5e8f\u5217\u7684 log \u6982\u7387<\/strong>\uff08\u5bf9\u6570\u6982\u7387\uff09\uff0c\u4e3b\u8981\u7528\u4e8e\u8bc4\u4f30\u8bed\u8a00\u6a21\u578b\uff08LLM\uff09\u7684\u751f\u6210\u8d28\u91cf\u3002\n\ndef compute_log_probs(\n        self,\n        model: AutoModelForCausalLM,\n        batch: PreferenceBatch,\n    ) -&gt; torch.Tensor:\n        \"\"\"Compute log probabilities of given sequences.\"\"\"\n        logits = model(**self.infer_batch(batch)).logits\n        device = logits.device\n        input_ids = 
batch&#091;'input_ids']\n        batch_size = len(batch&#091;'meta_info']&#091;'response_lens'])\n        logprob_list = &#091;]\n        for idx in range(batch_size):\n            response_length = batch&#091;'meta_info']&#091;'response_lens']&#091;idx]\n            raw_input_id = strip_pad(input_ids&#091;idx], self.tokenizer.pad_token_id)\n            logit = logits&#091;idx]&#091;-response_length:].unsqueeze(0)\n            input_id = raw_input_id&#091;-response_length:].unsqueeze(0)\n            log_p = gather_log_probabilities(logit&#091;:, :-1], input_id&#091;:, 1:])\n            logprob_list.append(log_p.squeeze(0))\n        return torch.nn.utils.rnn.pad_sequence(\n            logprob_list, batch_first=True, padding_value=0.0\n        ).to(device)<\/code><\/pre>\n\n\n\n<pre class=\"wp-block-code\"><code>class SimPOTrainer(DPOTrainer):\n\n    def loss(  # pylint: disable=too-many-locals\n        self,\n        batch: PreferenceBatch,\n    ) -&gt; dict&#091;str, torch.Tensor]:\n        \"\"\"Loss function for the SimPO algorithm.\"\"\"\n        sequence_log_probs = self.compute_log_probs(\n            self.model.module,\n            batch,\n        )\n# \u4f7f\u7528 chunk \u5c06 sequence_log_probs \u6309\u7167\u7b2c0\u7ef4\uff08\u6279\u6b21\u7ef4\u5ea6\uff09\u8fdb\u884c\u5207\u5206\u3002\u5c06\u6279\u6b21\u6570\u636e\u5206\u4e3a\u4e24\u90e8\u5206\uff1a\u4e00\u90e8\u5206\u5bf9\u5e94 \"\u66f4\u597d\"\uff08better_sequence_log_probs\uff09\uff0c\u53e6\u4e00\u90e8\u5206\u5bf9\u5e94 \"\u66f4\u5dee\"\uff08worse_sequence_log_probs\uff09\u3002\u6bcf\u90e8\u5206\u7684\u5927\u5c0f\u4e3a (B, L - 1)\uff0cB \u662f\u6279\u6b21\u5927\u5c0f\uff0cL \u662f\u5e8f\u5217\u957f\u5ea6\u3002  L-1 \u662f\u4e3a\u4e86\u5220\u9664\u6700\u540e\u7684 &lt;eos&gt;\n        (\n            better_sequence_log_probs,  # size = (B, L - 1)\n            worse_sequence_log_probs,  # size = (B, L - 1)\n        ) = sequence_log_probs.chunk(chunks=2, dim=0)\n\n        losses = &#091;]\n        
better_sample_rewards = &#091;]\n        worse_sample_rewards = &#091;]\n\n        better_input_ids, worse_input_ids = batch&#091;'input_ids'].chunk(chunks=2, dim=0)\n        better_attention_mask, worse_attention_mask = batch&#091;'attention_mask'].chunk(chunks=2, dim=0)\n\n        batch_size = better_input_ids.size(0)\n        for i in range(batch_size):\n#\u68c0\u67e5\u5f53\u524d\u6837\u672c\u7684 \"\u66f4\u597d\" \u548c \"\u66f4\u5dee\" \u90e8\u5206\u7684 input_ids \u662f\u5426\u76f8\u540c\u3002\u5982\u679c\u76f8\u540c\uff0c\u8df3\u8fc7\u8fd9\u4e2a\u6837\u672c\uff0c\u56e0\u4e3a\u5b83\u4eec\u5bf9\u6bd4\u4e0d\u51fa\u5dee\u5f02\u3002\n            if torch.all(torch.eq(better_input_ids&#091;i], worse_input_ids&#091;i])).item():\n                continue\n\n#\u5206\u522b\u8ba1\u7b97 \"\u66f4\u597d\" \u548c \"\u66f4\u5dee\" \u6837\u672c\u7684\u7ed3\u675f\u4f4d\u7f6e\uff08\u901a\u8fc7 attention_mask \u4e2d\u7684\u975e\u96f6\u5143\u7d20\u4f4d\u7f6e\u6765\u786e\u5b9a\uff09\u3002\n            better_end_index = better_attention_mask&#091;i].nonzero()&#091;-1].squeeze().item()\n            worse_end_index = worse_attention_mask&#091;i].nonzero()&#091;-1].squeeze().item()\n            better_input_length = better_end_index + 1\n            worse_input_length = worse_end_index + 1\n# diverge_index \u662f \"\u66f4\u597d\" \u548c \"\u66f4\u5dee\" \u6837\u672c\u4e4b\u95f4\u7684\u7b2c\u4e00\u4e2a\u5dee\u5f02\u4f4d\u7f6e\u3002\n            diverge_index = (\n                (better_input_ids&#091;i] != worse_input_ids&#091;i]).nonzero()&#091;0].squeeze().item()\n            )\n            assert 0 &lt;= diverge_index &lt;= better_end_index, 'diverge index is out of range!'\n            assert 0 &lt;= diverge_index &lt;= worse_end_index, 'diverge index is out of range!'\n#\u6839\u636e diverge_index \u8fdb\u884c\u5207\u7247\uff0c\u83b7\u53d6\u5dee\u5f02\u533a\u57df\u7684\u5bf9\u6570\u6982\u7387\u3002\n#better_log_prob \u548c worse_log_prob \u662f\u5bf9\u5e94\u4e8e \"\u66f4\u597d\" 
\u548c \"\u66f4\u5dee\" \u6837\u672c\u7684\u5bf9\u6570\u6982\u7387\u7684\u603b\u548c\u3002\n            better_seq_slice = slice(diverge_index, better_end_index + 1)\n            worse_seq_slice = slice(diverge_index, worse_end_index + 1)\n# \u8ba1\u7b97\u635f\u5931\u548c\u5956\u52b1\n            better_log_prob = better_sequence_log_probs&#091;i, better_seq_slice].sum(dim=-1)\n            worse_log_prob = worse_sequence_log_probs&#091;i, worse_seq_slice].sum(dim=-1)\n#\u5728\u957f\u5ea6\u4e0a\u5f52\u4e00\u5316\u7684\u5956\u52b1\u3010\/|y|\u3011\uff0c\u5176\u8ba1\u7b97\u65b9\u5f0f\u662f\u4f7f\u7528\u7b56\u7565\u6a21\u578b\u7684\u5956\u52b1\u4e2d\u6240\u6709 token \u7684\u5e73\u5747\u5bf9\u6570\u6982\u7387\uff1b\n            better_log_ratio = better_log_prob \/ better_input_length\n            worse_log_ratio = worse_log_prob \/ worse_input_length\n#\u76ee\u6807\u5956\u52b1\u5dee\u989d\u03b3\uff0c\u7528\u4ee5\u786e\u4fdd\u83b7\u80dc\u548c\u5931\u8d25\u54cd\u5e94\u4e4b\u95f4\u7684\u5956\u52b1\u5dee\u8d85\u8fc7\u8fd9\u4e2a\u5dee\u989d\u03b3\n            losses.append(\n                -F.logsigmoid(\n                    self.cfgs.train_cfgs.scale_coeff * (better_log_ratio - worse_log_ratio)\n                    - self.cfgs.train_cfgs.gamma,\n                ),\n            )\n            better_sample_rewards.append(\n                self.cfgs.train_cfgs.scale_coeff * better_log_ratio.detach(),\n            )\n            worse_sample_rewards.append(self.cfgs.train_cfgs.scale_coeff * worse_log_ratio.detach())\n        loss = torch.stack(losses).mean()  # size = ()\n        better_sample_reward = torch.stack(better_sample_rewards)  # size = (B,)\n        worse_sample_reward = torch.stack(worse_sample_rewards)  # size = (B,)\n        reward = better_sample_reward + worse_sample_reward  # size = (B,)\n        reward_accuracy = (better_sample_reward &gt; worse_sample_reward).float().mean()  # size = ()\n        reward_margin = better_sample_reward - worse_sample_reward  
# size = (B,)\n\n        return {\n            'loss': loss,\n            'reward': reward,\n            'better_sample_reward': better_sample_reward,\n            'worse_sample_reward': worse_sample_reward,\n            'reward_accuracy': reward_accuracy,\n            'reward_margin': reward_margin,\n        }<\/code><\/pre>\n\n\n\n<h2>KTO\u8bad\u7ec3\u4ee3\u7801\uff1a<\/h2>\n\n\n\n<pre class=\"wp-block-code\"><code>#  \u521b\u5efa \u4e0d\u5339\u914d\u7684\u63d0\u793a-\u56de\u7b54\u5bf9\uff1a\u9519\u4f4d\u4f20\u5165\u6279\u6b21\uff08batch\uff09\u4e2d\u7684 answer_input_ids \u548c answer_attention_mask \u6570\u636e\uff0c\u4ee5\u521b\u5efa\u4e0d\u5339\u914d\u7684\u63d0\u793a-\u56de\u7b54\u5bf9\u3002\u83b7\u53d6\u5f53\u524d\u7d22\u5f15\u524d\u4e00\u4e2a\u6837\u672c\u4f5c\u4e3a\u56de\u5e94\uff08response\uff09\u3002\u5982\u679c\u5f53\u524d\u7d22\u5f15\u662f 0\uff0c\u5219\u53d6\u6700\u540e\u4e00\u4e2a\u6837\u672c\u4f5c\u4e3a\u56de\u5e94\u3002\u8fd9\u662f\u4e3a\u4e86\u521b\u5efa\u201c\u4e0d\u5339\u914d\u201d\u7684\u6570\u636e\u5bf9\uff0c\u5373\u63d0\u793a\u548c\u56de\u5e94\u4e0d\u4e00\u5b9a\u662f\u6210\u5bf9\u7684\u3002\nclass UnmatchedSupervisedDataset(SupervisedDataset):\n\n    def preprocess(\n        self, raw_sample_for_prompt: dict&#091;str, Any], raw_sample_for_response: dict&#091;str, Any]\n    ) -&gt; SupervisedSample:\n        return_dict = {}\n        formatted_text, _ = self.template.format_unmatched_supervised_sample(\n            raw_sample_for_prompt, raw_sample_for_response\n        )\n        return_dict&#091;'input_ids'] = self.tokenize(formatted_text)\n\n        return return_dict\n\n    def __getitem__(self, index: int) -&gt; dict&#091;str, torch.Tensor]:\n        \"\"\"Get a tokenized data sample by index.\"\"\"\n        raw_sample_for_prompt = self.raw_data&#091;index]\n        if index == 0:\n            raw_sample_for_response = self.raw_data&#091;-1]\n        else:\n            raw_sample_for_response = self.raw_data&#091;index - 1]\n        data 
= self.preprocess(raw_sample_for_prompt, raw_sample_for_response)\n        return data\n\n    def get_collator(self) -&gt; Callable&#091;&#091;list&#091;dict&#091;str, torch.Tensor]]], dict&#091;str, torch.Tensor]]:\n        return UnmatchedSupervisedCollator(self.tokenizer.pad_token_id)\n<\/code><\/pre>\n\n\n\n<pre class=\"wp-block-code\"><code>\nclass KTOTrainer(DPOTrainer):\n\n# \u8ba1\u7b97kl\u6563\u5ea6\uff1a\u901a\u8fc7\u8ba1\u7b97\u5f53\u524d\u6a21\u578b\uff08self.model.module\uff09\u548c\u53c2\u8003\u6a21\u578b\uff08self.reference_model.module\uff09\u4e4b\u95f4\u7684 KL \u6563\u5ea6\u6765\u6bd4\u8f83\u5b83\u4eec\u7684\u6982\u7387\u5206\u5e03\n# \u9009\u62e9\u6700\u540e\u4e00\u4e2a batch \u7684 KL \u503c\u53ef\u80fd\u53ea\u662f\u5b9e\u73b0\u4e0a\u7684\u7b80\u5316\u3002\u5b9e\u9645\u4e2d\uff0c\u8ba1\u7b97\u6240\u6709 batch \u7684 KL \u6563\u5ea6\u5e76\u53d6\u5e73\u5747\uff0c\u6216\u8005\u91c7\u53d6\u5176\u4ed6\u66f4\u590d\u6742\u7684\u7b56\u7565\uff0c\u53ef\u80fd\u4f1a\u589e\u52a0\u989d\u5916\u7684\u8ba1\u7b97\u8d1f\u62c5\uff0c\u800c\u9009\u62e9\u6700\u540e\u4e00\u4e2a batch \u7684 KL \u503c\u662f\u4e00\u79cd\u66f4\u76f4\u63a5\u3001\u7b80\u4fbf\u7684\u5b9e\u73b0\u65b9\u5f0f\u3002\ndef compute_kl(self):\n    random_dataset = UnmatchedSupervisedDataset(\n        path=self.cfgs.data_cfgs.train_datasets,\n        template=self.train_template,\n        tokenizer=self.tokenizer,\n        processor=self.processor,\n        name=self.cfgs.data_cfgs.train_name,\n        size=self.cfgs.data_cfgs.train_size,\n        split=self.cfgs.data_cfgs.train_split,\n        data_files=self.cfgs.data_cfgs.train_data_files,\n        optional_args=self.cfgs.data_cfgs.train_optional_args,\n    )\n    seed = torch.randint(0, 100000, (1,)).item()\n    torch.manual_seed(seed)\n    self.random_dataloader = DataLoader(\n        random_dataset,\n        collate_fn=random_dataset.get_collator(),\n        sampler=DistributedSampler(random_dataset, shuffle=True),\n        
batch_size=self.cfgs.train_cfgs.per_device_kl_batch_size,\n    )\n    for batch in self.random_dataloader:\n        log_probs = self.compute_log_probs(  # size = (2 * B, L - 1)\n            self.model.module,\n            batch=batch,\n        )\n        ref_log_probs = self.compute_log_probs(  # size = (2 * B, L - 1)\n            self.reference_model.module,\n            batch=batch,\n        )\n        kl = (log_probs - ref_log_probs).mean()\n\n        self.kl = max(kl, 0)<\/code><\/pre>\n\n\n\n<pre class=\"wp-block-code\"><code># \u6b64\u65b9\u6cd5\u662f DPO (Direct Preference Optimization) \u7b97\u6cd5\u7684\u6838\u5fc3\u90e8\u5206\u3002\u5b83\u8ba1\u7b97\u4e86\u5728\u5f53\u524d\u6a21\u578b\u548c\u53c2\u8003\u6a21\u578b\u4e4b\u95f4\u7684\u5bf9\u6bd4\u635f\u5931\n    def loss(  # pylint: disable=too-many-locals\n        self,\n        batch: PreferenceBatch,\n    ) -&gt; dict&#091;str, torch.Tensor]:\n        \"\"\"Loss function for the DPO algorithm.\"\"\"\n        sequence_log_probs = self.compute_log_probs(\n            self.model.module,\n            batch,\n        )\n        (\n            better_sequence_log_probs,  # size = (B, L - 1)\n            worse_sequence_log_probs,  # size = (B, L - 1)\n        ) = sequence_log_probs.chunk(chunks=2, dim=0)\n\n        with torch.no_grad():\n            ref_sequence_log_probs = self.compute_log_probs(  # size = (2 * B, L - 1)\n                self.reference_model.module,\n                batch,\n            )\n            ref_better_sequence_log_probs, ref_worse_sequence_log_probs = (\n                ref_sequence_log_probs.chunk(chunks=2, dim=0)\n            )\n\n        losses = &#091;]\n        better_sample_rewards = &#091;]\n        worse_sample_rewards = &#091;]\n\n        better_input_ids, worse_input_ids = batch&#091;'input_ids'].chunk(chunks=2, dim=0)\n        better_attention_mask, worse_attention_mask = batch&#091;'attention_mask'].chunk(chunks=2, dim=0)\n\n        batch_size = better_input_ids.size(0)\n 
       for i in range(batch_size):\n            if torch.all(torch.eq(better_input_ids&#091;i], worse_input_ids&#091;i])).item():\n                continue\n            better_end_index = better_attention_mask&#091;i].nonzero()&#091;-1].squeeze().item()\n            worse_end_index = worse_attention_mask&#091;i].nonzero()&#091;-1].squeeze().item()\n            diverge_index = (\n                (better_input_ids&#091;i] != worse_input_ids&#091;i]).nonzero()&#091;0].squeeze().item()\n            )\n            assert 0 &lt;= diverge_index &lt;= better_end_index, 'diverge index is out of range!'\n            assert 0 &lt;= diverge_index &lt;= worse_end_index, 'diverge index is out of range!'\n\n            better_seq_slice = slice(diverge_index, better_end_index + 1)\n            worse_seq_slice = slice(diverge_index, worse_end_index + 1)\n\n            better_log_prob = better_sequence_log_probs&#091;i, better_seq_slice].sum(dim=-1)\n            worse_log_prob = worse_sequence_log_probs&#091;i, worse_seq_slice].sum(dim=-1)\n            ref_better_log_prob = ref_better_sequence_log_probs&#091;i, better_seq_slice].sum(dim=-1)\n            ref_worse_log_prob = ref_worse_sequence_log_probs&#091;i, worse_seq_slice].sum(dim=-1)\n            better_log_ratio = better_log_prob - ref_better_log_prob\n            worse_log_ratio = worse_log_prob - ref_worse_log_prob\n\n# \u8ba1\u7b97loss\uff0ckl\u503c\u4f5c\u4e3a\u57fa\u51c6\n            losses.append(\n                self.cfgs.train_cfgs.scale_better\n                * (1 - F.sigmoid(self.cfgs.train_cfgs.scale_coeff * (better_log_ratio - self.kl)))\n                - self.cfgs.train_cfgs.scale_worse\n                * (1 - F.sigmoid(self.cfgs.train_cfgs.scale_coeff * (self.kl - worse_log_ratio))),\n            )\n            better_sample_rewards.append(\n                self.cfgs.train_cfgs.scale_coeff * better_log_ratio.detach(),\n            )\n            worse_sample_rewards.append(self.cfgs.train_cfgs.scale_coeff * 
worse_log_ratio.detach())\n        loss = torch.stack(losses).mean()  # size = ()\n        better_sample_reward = torch.stack(better_sample_rewards)  # size = (B,)\n        worse_sample_reward = torch.stack(worse_sample_rewards)  # size = (B,)\n        reward = better_sample_reward + worse_sample_reward  # size = (B,)\n        reward_accuracy = (better_sample_reward &gt; worse_sample_reward).float().mean()  # size = ()\n        reward_margin = better_sample_reward - worse_sample_reward  # size = (B,)\n\n        return {\n            'loss': loss,\n            'reward': reward,\n            'better_sample_reward': better_sample_reward,\n            'worse_sample_reward': worse_sample_reward,\n            'reward_accuracy': reward_accuracy,\n            'reward_margin': reward_margin,\n        }<\/code><\/pre>\n\n\n\n<pre class=\"wp-block-code\"><code>#\u6267\u884c\u8bad\u7ec3\u6b65\u9aa4\uff1a\u8fd9\u4e2a\u65b9\u6cd5\u5728\u6bcf\u4e00\u4e2a\u8bad\u7ec3\u6b65\u4e2d\u8ba1\u7b97\u5e76\u53cd\u5411\u4f20\u64ad\u635f\u5931\u3002\u5b83\u66f4\u65b0\u6a21\u578b\u53c2\u6570\u5e76\u8ba1\u7b97\u5e76\u8fd4\u56de\u8bad\u7ec3\u4fe1\u606f\u3002\n#\u5956\u52b1\u8ba1\u7b97\uff1a\u901a\u8fc7 reward\u3001better_sample_reward \u548c worse_sample_reward \u7b49\u6307\u6807\u6765\u8861\u91cf\u6a21\u578b\u7684\u6027\u80fd\u3002\n#\u5168\u5c40\u5e73\u5747\uff1aget_all_reduce_mean() \u7528\u4e8e\u5206\u5e03\u5f0f\u8bad\u7ec3\uff0c\u786e\u4fdd\u5728\u591a\u4e2a\u8bbe\u5907\u4e0a\u8ba1\u7b97\u7684\u503c\u88ab\u5e73\u5747\uff0c\u4ee5\u4fdd\u8bc1\u8bad\u7ec3\u7684\u4e00\u81f4\u6027\u3002\ndef train_step(self, batch: PreferenceBatch) -&gt; dict&#091;str, Any]:\n    \"\"\"Perform a single training step for KTO.\"\"\"\n    loss_dict = self.loss(batch=batch)\n    loss = loss_dict&#091;'loss']\n    self.model.backward(loss)\n    self.model.step()\n\n    with torch.no_grad():\n        reward = loss_dict&#091;'reward'].mean()\n        better_sample_reward = 
loss_dict&#091;'better_sample_reward'].mean()\n        worse_sample_reward = loss_dict&#091;'worse_sample_reward'].mean()\n        reward_accuracy = loss_dict&#091;'reward_accuracy']\n        reward_margin = loss_dict&#091;'reward_margin'].mean()\n\n    loss = get_all_reduce_mean(loss)\n    reward = get_all_reduce_mean(reward)\n    better_sample_reward = get_all_reduce_mean(better_sample_reward)\n    worse_sample_reward = get_all_reduce_mean(worse_sample_reward)\n    reward_accuracy = get_all_reduce_mean(reward_accuracy)\n    reward_margin = get_all_reduce_mean(reward_margin)\n\n    return {\n        'train\/loss': loss.item(),\n        'train\/reward': reward.item(),\n        'train\/better_sample_reward': better_sample_reward.item(),\n        'train\/worse_sample_reward': worse_sample_reward.item(),\n        'train\/reward_accuracy': reward_accuracy.item(),\n        'train\/reward_margin': reward_margin.item(),\n        'train\/lr': self.model.optimizer.param_groups&#091;0]&#091;'lr'],\n    }<\/code><\/pre>\n\n\n\n<h1>\u601d\u8003\uff1a<\/h1>\n\n\n\n<p class=\"has-light-pink-background-color has-background\">\u6765\u6e90\uff1a<a href=\"https:\/\/wqw547243068.github.io\/rlhf#%E6%80%9D%E8%80%83-1\"><strong><em>https:\/\/wqw547243068.github.io\/rlhf#%E6%80%9D%E8%80%83-1<\/em><\/strong><\/a><\/p>\n\n\n\n<h2>0\u3001<strong>KL\u60e9\u7f5a<\/strong><\/h2>\n\n\n\n<h3><strong>KL\u662f\u653e\u5728\u5956\u52b1\u51fd\u6570\u91cc\u9762\uff0c\u8fd8\u662f\u653e\u5728\u5916\u9762<\/strong>\uff1f<\/h3>\n\n\n\n<p>PPO \u4e2d\u7684Rt\u8ba1\u7b97\uff1a<\/p>\n\n\n\n<figure class=\"wp-block-image size-large\"><img loading=\"lazy\" width=\"1024\" height=\"409\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-41-1024x409.png\" alt=\"\" class=\"wp-image-27782\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-41-1024x409.png 1024w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-41-300x120.png 300w, 
http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-41-768x307.png 768w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-41.png 1107w\" sizes=\"(max-width: 1024px) 100vw, 1024px\" \/><\/figure>\n\n\n\n<figure class=\"wp-block-image size-full\"><img loading=\"lazy\" width=\"1020\" height=\"283\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-40.png\" alt=\"\" class=\"wp-image-27780\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-40.png 1020w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-40-300x83.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-40-768x213.png 768w\" sizes=\"(max-width: 1020px) 100vw, 1020px\" \/><\/figure>\n\n\n\n<p>\u7684\u505a\u6cd5\u90fd\u80fd\u89e3\u91ca\u7684\u901a,\u5176\u5b9e\u5b9e\u8d28\u5176\u5b9e\u662f\u4e00\u4e2a\u8d1d\u53f6\u65af\u63a8\u65ad[<strong><a rel=\"noreferrer noopener\" href=\"https:\/\/link.zhihu.com\/?target=https%3A\/\/www.limoncc.com\/post\/bd7e41e00dc8afad\" 
target=\"_blank\">\u4ecb\u7ecd\u6587\u7ae0<\/a><\/strong>]\u3002\u4e0d\u8fc7<strong>\u52a0\u5165\u5956\u52b1\u51fd\u6570\u91cc\u9762\u63a7\u5236\u7c92\u5ea6\u66f4\u7ec6<\/strong>\uff0c<strong>\u8bad\u7ec3\u5e94\u8be5\u66f4\u52a0\u7a33\u5b9a<\/strong>\u3002\u4f46\u662f\u7693\u5929\u5927\u4f6c\u7528REINFORCE++\u590d\u73b0\u65f6\u53d1\u73b0<strong>\u52a0\u5165KL\u7ea6\u675f\u4f1a\u9650\u5236\u6a21\u578b\u63a2\u7d22\u7a7a\u95f4<\/strong>\u3002\u7693\u5929\u5927\u4f6c\u7684\u6587\u7ae0\u5f88\u6709\u542f\u53d1\u6027\uff0c\u6307\u51fa\u5728base\u6a21\u578b\u53d8\u5f3a\u4ee5\u540e\uff0c\u5176\u5b9e\u73b0\u6709\u7684RL\u7b97\u6cd5\u5728\u89c4\u5219\u5956\u52b1\u4e0a\u5e94\u8be5\u90fd\u80fdwork\u3002<strong>\u66f4\u52a0\u5e94\u8be5\u63a2\u7d22\u5982\u4f55\u57fa\u4e8e\u5f3abase\u6a21\u578b\u6765\u4f18\u5316RL\u7b97\u6cd5\uff0c\u4e0d\u5e94\u62d8\u6ce5\u4e8e\u539f\u6765RL\u8bad\u7ec3\u4e0d\u7a33\u5b9a\uff0c\u96be\u8bad\u7ec3\u8fd9\u79cd\u4f20\u7edf\u89c2\u5ff5\u3002<\/strong><\/p>\n\n\n\n<p>\u4e0b\u9762\u8fd8\u662f\u7565\u5fae\u6765\u89e3\u91ca\u4e00\u4e0bKL\u7684\u4f5c\u7528\u5b9e\u8d28\u5230\u5e95\u662f\u4ec0\u4e48\uff1a 
\u4e00\u5f00\u59cb\u6211\u4eec\u901a\u8fc7\u8bed\u6599\u8bad\u7ec3\u4e86\u4e00\u4e2a\u9884\u8bad\u7ec3\u6a21\u578b&nbsp;<em>\u03c0<\/em><sub><em>PT<\/em><\/sub>\uff0c\u7136\u540e\u53c8\u5f97\u5230\u4e00\u4e2a&nbsp;<em>\u03c0<\/em><sub><em>SFT<\/em><\/sub>\u6a21\u578b\uff0c\u63a5\u4e0b\u6765\u901a\u8fc7<em>RLHF<\/em>\u6211\u4eec\u8981\u5f97\u5230\u4e00\u4e2a&nbsp;<em>\u03c0<\/em><sub><em>RLHF<\/em><\/sub>\u7684\u6a21\u578b\u3002\u8fd9\u5176\u5b9e\u662f\u4ec0\u4e48\uff1f\u662f\u4e0d\u65ad\u8c03\u6574\u5206\u5e03\u7684\u8fc7\u7a0b\uff0c\u6216\u8005\u8bf4\u662f\u8bed\u8a00\u6a21\u578b\u4e0d\u65ad\u8c03\u6574\u4fe1\u5ff5\u7684\u8fc7\u7a0b\u3002<\/p>\n\n\n\n<p>\u7b26\u53f7\u8bf4\u660e\uff1a<strong><em>D={(xi,yi)}<\/em><\/strong>\uff0c\u5176\u4e2d&nbsp;<em>xi<\/em>\u8868\u793a\u6307\u4ee4\uff0c&nbsp;<em>yi<\/em>\u662f\u9884\u8bad\u7ec3\u7684\u8bed\u8a00\u6a21\u578b\u7684\u8f93\u51fa\u3002&nbsp;<em>\u03c0(y\u2223x)<\/em>\u662f\u4ece\u6307\u4ee4\u5230\u8f93\u51fa\u7684\u6982\u7387\u5206\u5e03\u3002<\/p>\n\n\n\n<p>1\u3001\u9996\u5148\u4e00\u5f00\u59cb\u6709\u4e00\u4e2a\u5728\u5927\u89c4\u6a21\u8bed\u6599\u4e0a\u8bad\u7ec3\u7684\u8bed\u8a00\u6a21\u578b&nbsp;<em>\u03c00(y\u2223x)<\/em>\uff0c
\u76ee\u524d\u5b83\u8868\u73b0\u6b20\u4f73\uff0c\u5b83\u7684\u4e16\u754c\u548c\u4eba\u7c7b\u7684\u4e16\u754c\u5dee\u522b\u6709\u70b9\u5927\uff0c\u8bf4\u8d77\u8bdd\u6765\u524d\u8a00\u4e0d\u642d\u540e\u8bed\u3002<\/p>\n\n\n\n<p>2\u3001\u597d\u4e86\uff0c\u73b0\u5728\u6709\u4e00\u4e2a\u5bf9\u8bdd\u8bed\u6599<em>&nbsp;D={(xi,yi)}<\/em>\uff0c\u8fd9\u4e2a\u5bf9\u8bdd\u8bed\u6599\u7684\u7279\u70b9\u5c31\u662f\u771f\u5b9e\u53cd\u6620\u4e86\u4eba\u7c7b\u4e16\u754c\u7684\u60c5\u51b5\uff0c\u6216\u8005\u8bf4\u57fa\u4e8e\u6b64\u6211\u4eec\u80fd\u751f\u6210\u4e00\u4e2a\u8bc4\u5206\u51fd\u6570<em>&nbsp;r(x,y)<\/em>\uff0c\u8fd9\u4e2a\u51fd\u6570\u80fd\u7ed9\u8bed\u8a00\u6a21\u578b\u57fa\u4e8e\u6307\u4ee4<em>&nbsp;x<\/em>\u751f\u6210\u7684<em>&nbsp;y<\/em>\u6253\u5206\u3002\u65e2\u7136\u5982\u6b64\uff0c\u4e0d\u5982\u8fd9\u6837\u601d\u8003\uff1a<\/p>\n\n\n\n<figure class=\"wp-block-image size-full is-resized\"><img loading=\"lazy\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-42.png\" alt=\"\" class=\"wp-image-27792\" width=\"257\" height=\"61\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-42.png 412w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-42-300x71.png 300w\" sizes=\"(max-width: 257px) 100vw, 257px\" 
\/><\/figure>\n\n\n\n<p>\u7ed9\u5b9a&nbsp;x,y\uff0c\u5bf9\u8bc4\u5206&nbsp;r\u9644\u4e0a\u4e00\u4e2a\u4fe1\u5ff5\u6216\u8005\u6982\u7387&nbsp;q\uff0c\u8fd9\u91cc\u7684&nbsp;\u03b2\u662f\u4e00\u4e2a\u4fe1\u5ff5\u53ef\u8c03\u6574\u7684\u8d85\u53c2\u6570\u3002\u8bc4\u5206\u4fe1\u5ff5\u662f\u4e2a\u6bd4\u8f83\u4e3b\u89c2\u7684\u4e1c\u897f\uff0c\u52a0\u4e2a\u53ef\u8c03\u6574\u53c2\u6570\u6765\u8c03\u8282\uff0c\u4ee5\u4fbf\u8ba9\u5927\u591a\u6570\u4eba\u6ee1\u610f\u3002<\/p>\n\n\n\n<p>3\u3001\u73b0\u5728\u7684\u95ee\u9898\u5c31\u53d8\u6210\u4e86\u5982\u4f55\u6839\u636e\u521d\u59cb\u6a21\u578b<em>&nbsp;\u03c00(y\u2223x)<\/em>\u548c\u4eba\u7c7b\u7684\u8bc4\u5206\u4fe1\u5ff5&nbsp;<em>q(r\u2223y,x)<\/em>\u6765\u8c03\u6574\u6a21\u578b\u53c2\u6570&nbsp;\u03b8\u5f97\u5230\u4e00\u4e2a\u65b0\u7684\u6a21\u578b&nbsp;<em>\u03c0\u03b8(y\u2223x)<\/em>\u3002<\/p>\n\n\n\n<p>\u5982\u679c\u8bf4<em>&nbsp;\u03c00(y\u2223x)<\/em>\u662f\u5148\u9a8c\u5206\u5e03\uff0c\u90a3\u4e48\u4eba\u7c7b\u7684\u8bc4\u5206\u4fe1\u5ff5<em>&nbsp;q(r\u2223y,x)<\/em>\u5c31\u662f\u4f3c\u7136\u51fd\u6570\uff0c\u4e8e\u662f\u6211\u4eec\u53ef\u4ee5\u6784\u9020\u4e00\u4e2a\u540e\u9a8c\u5206\u5e03\uff1a<\/p>\n\n\n\n<figure class=\"wp-block-image size-large is-resized\"><img loading=\"lazy\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-43-1024x435.png\" alt=\"\" class=\"wp-image-27796\" width=\"579\" height=\"246\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-43-1024x435.png 1024w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-43-300x127.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-43-768x326.png 768w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-43.png 1039w\" sizes=\"(max-width: 579px) 100vw, 579px\" 
\/><\/figure>\n\n\n\n<p>\u73b0\u5728\u6211\u4eec\u662f\u65e0\u6cd5\u76f4\u63a5\u5f97\u5230&nbsp;\u03c0KL-RL(y\u2223x,r)\uff0c\u56e0\u4e3a\u8ba1\u7b97\u8bc1\u636eZ(y,x,r)\u7684\u8ba1\u7b97\u91cf\u662f\u5de8\u5927\u7684\u3002\u4f46\u662f\u6211\u4eec\u53ef\u4ee5\u8ba9\u4e00\u4e2a\u5206\u5e03\u63a5\u8fd1\u5b83\uff0c\u6216\u8005\u6700\u597d\u7684\u65b9\u5f0f\u5c31\u662f\u5c31\u5730\u53d6\u6750\u5fae\u8c03&nbsp;<em>\u03c00(y\u2223x,\u03b8)<\/em>\u5f97\u5230<em>&nbsp;\u03c0\u03b8(y\u2223x)<\/em>\u4f7f\u5f97\u5b83\u63a5\u8fd1\u6211\u4eec\u7684\u540e\u9a8c<em>\u03c0KL-RL(y\u2223x,r)<\/em>\uff0c\u8fd9\u6837\u6211\u4eec\u5c31\u5f97\u5230\u4e86\u5bf9\u9f50\u540e\u7684\u6a21\u578b\u3002\u6211\u4eec\u81ea\u7136\u5c31\u4f7f\u7528\u5230\u4e86\u8ba1\u7b97\u5206\u5e03\u76f8\u4f3c\u5ea6\u7684&nbsp;KL\u6563\u5ea6\u3002\u4e8e\u662f\u95ee\u9898\u5c31\u53d8\u4e3a\u4e86\uff1a<\/p>\n\n\n\n<figure class=\"wp-block-image size-full\"><img loading=\"lazy\" width=\"1024\" height=\"364\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-44.png\" alt=\"\" class=\"wp-image-27799\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-44.png 1024w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-44-300x107.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/07\/image-44-768x273.png 768w\" sizes=\"(max-width: 1024px) 100vw, 1024px\" \/><\/figure>\n\n\n\n<p>KL\u653e\u5728\u5956\u52b1\u51fd\u6570\u91cc\u9762\uff0c\u8fd8\u662f\u653e\u5728\u4f18\u52bf\u51fd\u6570\u5916\u9762\uff0c\u4e0d\u8fc7\u662f\u8bc4\u5206\u4fe1\u5ff5\u7684\u4e0d\u540c\uff1a\u5728token\u5c42\u7ea7\u4f3c\u7136\u51fd\u6570\u5c31\u662f\u5956\u52b1\uff0c\u5728\u4f18\u52bf\u51fd\u6570\u5916\u9762\u5c31\u662f\u4f18\u52bf\u51fd\u6570\u3002\u4e24\u8005\u5bf9\u8d1d\u53f6\u65af\u66f4\u65b0\u63d0\u4f9b\u7684\u4fe1\u606f\u4e0d\u540c\uff0c\u7c92\u5ea6\u4e0e\u5c42\u6b21\u4e5f\u4e0d\u540c\u3002<\/p>\n\n\n\n<h2>1\u3001RL \u6709\u7528\u5417\uff1f<\/h2>\n\n\n\n<p>&nbsp;<a rel=\"noreferrer noopener\" 
href=\"https:\/\/arxiv.org\/pdf\/2504.13837\" target=\"_blank\">Does Reinforcement Learning Really Incentivize Reasoning Capacity in LLMs Beyond the Base Model?<\/a><\/p>\n\n\n\n<p>RL\u7ed9\u4e0d\u4e86\u65b0\u77e5\u8bc6,\u53ea\u662f<strong>\u6fc0\u53d1<\/strong>\u4e86 Base Model \u80fd\u529b, \u5f3a\u5316\u5b66\u4e60\u7684\u8fb9\u754c\u88ab\u57fa\u5ea7\u6a21\u578b\u201c\u9501\u6b7b\u201d\u3002RL \u53ea\u662f\u8ba9 Base Model \u671d\u66f4\u80fd\u7ed9\u5230\u6b63\u786e\u7b54\u6848\u7684\u65b9\u5411\u89e3\u9898, \u5b9e\u9645\u4e0a\uff0cBase Model \u4e0d\u4f1a\u7684,\u53ef\u80fd\u6c38\u8fdc\u4e0d\u4f1a, \u4f1a\u7684,\u5076\u5c14\u80fd\u505a\u5bf9, RL\u80fd\u589e\u52a0\u8fd9\u4e2a\u505a\u5bf9\u7684\u6982\u7387\u3002<\/p>\n\n\n\n<p><code>RLVR<\/code>\uff08\u53ef\u9a8c\u8bc1\u5956\u52b1\u7684\u5f3a\u5316\u5b66\u4e60\uff09\u5728\u6570\u5b66\u3001\u4ee3\u7801\u3001\u89c6\u89c9\u63a8\u7406\u7b49\u4efb\u52a1\u4e2d\u8868\u73b0\u7a81\u51fa\uff0c\u88ab\u89c6\u4e3a\u63d0\u5347\u5927\u8bed\u8a00\u6a21\u578b\uff08LLM\uff09\u63a8\u7406\u80fd\u529b\u7684\u5173\u952e\u624b\u6bb5\u3002<\/p>\n\n\n\n<p>\u7136\u800c\uff0c\u6838\u5fc3\u95ee\u9898\u59cb\u7ec8\u5b58\u5728\uff1a<strong>\u5f3a\u5316\u5b66\u4e60<\/strong>\u771f\u80fd\u8ba9\u5927\u6a21\u578b\u83b7\u5f97\u8d85\u8d8a<strong>\u57fa\u5ea7\u6a21\u578b<\/strong>\u7684\u65b0\u63a8\u7406\u80fd\u529b\uff1f<\/p>\n\n\n\n<p>\u5728\u6570\u5b66\u3001\u4ee3\u7801\u3001\u89c6\u89c9\u63a8\u7406\u4e09\u5927\u9886\u57df\u7684\u7cfb\u7edf\u6027\u5b9e\u9a8c\u53d1\u73b0\uff1a<\/p>\n\n\n\n<ul><li><strong>\u80fd\u529b\u8fb9\u754c\u672a\u7a81\u7834<\/strong>\uff1aRLVR\u6a21\u578b\u6240\u6709\u63a8\u7406\u8def\u5f84\u5747<strong>\u5df2\u5b58\u5728<\/strong>\u4e8e\u57fa\u7840\u6a21\u578b\u4e2d\uff0c\u5f3a\u5316\u5b66\u4e60<strong>\u5e76\u672a<\/strong>\u8d4b\u4e88\u6a21\u578b\u65b0\u7684\u63a8\u7406\u80fd\u529b\u3002<\/li><li><strong>\u91c7\u6837\u6548\u7387\u4e0e\u8986\u76d6\u80fd\u529b\u6743\u8861<\/strong>\uff1aRLVR&nbsp;<strong>\u5c0f\u91c7\u6837<\/stro
ng>\u6b21\u6570\u4e0b\u8868\u73b0\u4f18\u4e8e\u57fa\u5ea7\u6a21\u578b\uff0c\u4f46\u968f\u7740\u91c7\u6837\u6b21\u6570\u589e\u52a0\uff0c<strong>\u57fa\u5ea7\u6a21\u578b\u9010\u6e10\u8ffd\u5e73\u5e76\u53cd\u8d85<\/strong>\uff0c\u663e\u793a\u51fa\u66f4\u5e7f\u6cdb\u7684\u8986\u76d6\u80fd\u529b\u3002<\/li><li><strong>\u7b54\u6848\u540c\u6e90\u6027<\/strong>\uff1aRLVR\u6a21\u578b\u6b63\u786e\u7b54\u6848<strong>\u5747<\/strong>\u6765\u81ea<strong>\u57fa\u5ea7\u6a21\u578b<\/strong>\u7684\u8f93\u51fa\u5206\u5e03\uff0c\u5f3a\u5316\u5b66\u4e60\u53ea\u662f\u901a\u8fc7<strong>\u8c03\u6574\u6982\u7387\u5206\u5e03<\/strong>\u7b5b\u9009\u9ad8\u5956\u52b1\u8def\u5f84\u3002<\/li><\/ul>\n\n\n\n<p>\u663e\u793a\uff1a<\/p>\n\n\n\n<ul><li>\u6570\u5b66\u63a8\u7406\u4efb\u52a1\u4e2d\uff0c\u57fa\u5ea7\u6a21\u578b\u5728\u591a\u6b21\u91c7\u6837\u540e\u7684\u80fd\u529b\u8868\u73b0\u9010\u6e10<strong>\u8ffd\u5e73\u5e76\u53cd\u8d85<\/strong>RL\u6a21\u578b\u3002<\/li><li>\u4ee3\u7801\u751f\u6210\u4efb\u52a1\u4e2d\uff0cRL\u6a21\u578b\u63d0\u5347\u4e86<strong>\u5355\u6837\u672c<\/strong>\u51c6\u786e\u7387\uff0c\u4f46\u5728\u66f4\u9ad8\u91c7\u6837\u6b21\u6570\u4e0b\uff0c\u57fa\u5ea7\u6a21\u578b\u4ecd\u5c55\u73b0\u51fa<strong>\u66f4\u5f3a<\/strong>\u7684\u8986\u76d6\u80fd\u529b\u3002<\/li><li>\u89c6\u89c9\u63a8\u7406\u4efb\u52a1\u4e2d\uff0cRL\u8bad\u7ec3\u540e\u7684\u6a21\u578b\u5728<strong>\u5355\u6b21<\/strong>\u56de\u7b54\u51c6\u786e\u7387\u4e0a\u63d0\u5347\u663e\u8457\uff0c\u4f46\u57fa\u5ea7\u6a21\u578b\u5728<strong>\u591a\u6b21\u91c7\u6837<\/strong>\u540e\u4ecd\u8868\u73b0\u51fa\u66f4\u5e7f\u6cdb\u7684\u95ee\u9898\u8986\u76d6\u80fd\u529b<\/li><\/ul>\n\n\n\n<p>RLVR\u53ea\u662f\u8ba9\u6a21\u578b\u66f4<strong>\u504f\u5411\u9ad8\u5956\u52b1<\/strong>\u89e3\u51b3\u65b9\u6848\uff0c\u800c\u975e<strong>\u521b\u9020<\/strong>\u65b0\u7684\u63a8\u7406\u80fd\u529b\u3002<\/p>\n\n\n\n<p>\u5bf9\u5956\u52b1\u8def\u5f84\u7684\u805a\u7126\uff0c<strong>\u524a\u5f31\u4e86\u6a21\u578b\u7684\u63a2\u7d22\u80fd\u529b<\
/strong>\uff0c\u9650\u5236\u4e86\u5927\u89c4\u6a21\u91c7\u6837\u65f6\u5bf9\u53ef\u89e3\u95ee\u9898\u7684\u8986\u76d6\u8303\u56f4\u3002<\/p>\n\n\n\n<p>Key Insights<\/p>\n\n\n\n<ol><li>\u5c3d\u7ba1RL\u8bad\u7ec3\u540e\u7684\u6a21\u578b\u5728 pass@k(k=1) \u60c5\u51b5\u4e0b\u8d85\u8d8a Base\u6a21\u578b, \u4f46\u662f BaseModel \u5728k\u503c\u4e0d\u505a\u9650\u5236\u60c5\u51b5\u4e0b, \u53ef\u80fd\u6bd4RL\u540e\u7684\u6a21\u578bpass\u7387\u8fd8\u9ad8;<\/li><li>RL \u53ea\u662f\u4f18\u5316\u4e86 Base Model&nbsp;<strong>\u91c7\u6837\u6548\u7387<\/strong>, \u4e00\u65b9\u9762\u589e\u52a0\u4e86Base Model\u4e00\u6b21\u5c31\u80fd\u505a\u5bf9\u9898\u7684\u6982\u7387,\u4f46\u540c\u65f6<strong>\u9650\u5236\u4e86\u6a21\u578b\u7684\u63a2\u7d22\u80fd\u529b<\/strong>,\u5bfc\u81f4\u4e86\u5728\u589e\u52a0pass@k\u7684k\u65f6\u5019, Base Model \u505a\u5bf9\u9898\u7684\u6982\u7387\u53cd\u800c\u589e\u52a0\u4e86;<\/li><li>CoT \u65b9\u6cd5\u5bf9\u6a21\u578b Finetune \u66f4\u80fd\u6fc0\u53d1\u6a21\u578b\u7684\u505a\u9898\u80fd\u529b<\/li><\/ol>\n\n\n\n<p>\u5bf9\u6bd4 CoT\u5bf9\u6a21\u578b\u8fdb\u884cFinetune<\/p>\n\n\n\n<ol><li>\u4eceR1\u84b8\u998f\u7684\u6570\u636e\u5bf9\u6a21\u578b\u76f4\u63a5\u8fdb\u884cCoT Finetune,\u5728\u540c\u6837\u591a\u6b21Sample\u770bpass\u7ed3\u679c\u4e0a, <strong>CoT \u786e\u5b9e\u662f\u5728 Base Model\u4e0a\u8db3\u91cf\u63d0\u5347,\u8d85\u8d8a Base Model,\u5e76\u6bd4RL\u7684\u7ed3\u679c\u66f4\u597d<\/strong>\u3002\u4f46\u8fd9\u4e2a\u56fe\u91cc\u9762\u5947\u602a\u7684\u662fInstruct\u7684\u6a21\u578b\u751a\u81f3\u6ca1\u6709Base\u7248\u672c\u5728AIME24\u7684\u8868\u73b0\u4e0a\u597d?<\/li><li>\u4e0d\u540cRL\u7b97\u6cd5\u6574\u4f53\u5dee\u5f02\u5e76\u4e0d\u5927\u3002\u4e0d\u540c\u7684RL\u7b97\u6cd5,\u6bd4\u5982PPO,DAPO,GRPO\u7b49<\/li><\/ol>\n\n\n\n<p>\u601d\u8003<\/p>\n\n\n\n<p>\u4e3a\u4ec0\u4e48 AlphaGO \u548c\u73a9\u6e38\u620f, RL\u80fd\u53d1\u6398\u65b0\u7684\u80dc\u5229\u6a21\u5f0f, \u800c LLM \u4e2d\u7684RL\u4e0d\u884c?<\/p>\n\n\n\n<ol><li><strong>LLM 
\u8f93\u51fatoken\u6982\u7387\u7a7a\u95f4\u6bd4\u6e38\u620f\u6982\u7387\u7a7a\u95f4\u5927\u5f88\u591a<\/strong>, \u56e0\u6b64\uff0cRL\u4f18\u5316LLM \u66f4\u96be, \u5e76\u4e14Reasoning \u7ecf\u5e38\u662f\u4ecePretrain Model\u5f00\u59cb\u8bad\u7ec3, \u800cPretrain\u6a21\u578b\u672c\u8eab\u53d7\u9650\u5236\u4e8e\u9884\u8bad\u7ec3\u7684\u8bed\u6599,\u8bad\u7ec3\u6e38\u620f\u7684\u4e00\u822c\u90fd\u662f\u968f\u673a\u521d\u59cb\u5316,\u5bfc\u81f4\u53ef\u80fdPretrain\u6a21\u578b\u672c\u8eab\u5c31\u4e0d\u5305\u542b\u6240\u6709\u80fd\u89e3\u51b3\u95ee\u9898\u7684\u5148\u9a8c(\u6bd4\u5982\u4e00\u4e2a\u95ee\u9898\u6c38\u8fdc\u7b54\u4e0d\u5bf9,Reward\u6c38\u8fdc\u662f0),\u800c\u968f\u673a\u521d\u59cb\u5316\u7684\u53ef\u80fd\u672c\u8eab\u5c31\u5b58\u5728\u53ef\u80fd\u4e3a1\u7684\u60c5\u51b5,RL\u624d\u6709\u53ef\u80fd\u627e\u5230\u6b63\u786e\u7b54\u6848\u3002<\/li><li>Pretrain \u6a21\u578b\u7684<strong>\u5148\u9a8c\u77e5\u8bc6\u9650\u5236\u592a\u5f3a<\/strong>, \u5bfc\u81f4\u6a21\u578b\u63a2\u7d22\u8bf4\u8bdd\u7a7a\u95f4\u65f6,\u4f1a\u56e0\u4e3a\u9519\u8bef\u683c\u5f0f\u6216\u8005\u8bed\u53e5\u4e0d\u901a\u88ab\u5e72\u6389, \u5373\u4f7f\u6709\u53ef\u80fd\u5bfc\u51fa\u6b63\u786e\u7b54\u6848,\u4e5f\u4f1a\u56e0\u4e3a\u4e2d\u95f4\u6b65\u9aa4\u4ea7\u751f\u95ee\u9898,\u800c\u6c38\u8fdc\u5931\u8d25;<\/li><li>RL\u7b97\u6cd5<strong>\u8bbe\u8ba1\u673a\u5236<\/strong>\u6f5c\u5728\u9650\u5236\u4e86\u6a21\u578b\u63a2\u7d22\u6b63\u786e\u7b54\u6848\u7684\u53ef\u80fd\u6027, \u6bd4\u5982 PPO\u7b97\u6cd5\u4e2d\u7684<code>KL Divergence<\/code>\u7ea6\u675f\u4e86\u6a21\u578b\u524d\u540e\u6982\u7387\u5206\u5e03\u4e0d\u80fd\u5dee\u522b\u8fc7\u5927\u3002<\/li><\/ol>\n\n\n\n<p>\u5f88\u591a\u4eba\u8ba4\u4e3a\uff0c\u5f3a\u5316\u5b66\u4e60\uff08RL\uff09\u80fd\u591f\u6cdb\u5316\u5230\u4e0d\u540c\u4efb\u52a1\u4e2d\uff0c\u76d1\u7763\u5fae\u8c03\uff08SFT\uff09\u53ef\u4ee5\u8bb0\u5fc6\u77e5\u8bc6\u70b9\uff0c\u53e6\u5916\uff0c\u8fd8\u6709\u7c7b\u4f3c R1-zero 
\u7684\u7ed3\u8bba\u7b49\u3002<\/p>\n\n\n\n<p>\u800c\u5982\u4eca\u6574\u4f53\u5f00\u6e90\u793e\u533a\u7684\u63a2\u7d22\u5df2\u7ecf\u6df1\u5165\u4e86\u8bb8\u591a\u3002<\/p>\n\n\n\n<ul><li>\u89e3\u9898\u65b9\u9762\uff0c\u6784\u5efa\u51fa\u8272\u7684\u57fa\u7840\u6a21\u578b\uff08Base Model\uff09\u3002<\/li><li>\u5b9e\u9645\u4e0a\uff0c\u5f88\u591a\u57fa\u7840\u6a21\u578b\u90fd\u5df2\u7ecf\u8fc7\u6307\u4ee4\u5fae\u8c03\uff0c\u53ea\u662f\u6ca1\u6709\u7ecf\u8fc7\u590d\u6742\u6307\u4ee4\u5fae\u8c03\uff0c\u6240\u4ee5\u5f88\u96be\u7b80\u5355\u5730\u5c06\u5176\u8ba4\u5b9a\u4e3a\u4e00\u4e2a\u5355\u7eaf\u7684\u9884\u8bad\u7ec3\uff08Pretrain\uff09\u6a21\u578b\uff0c\u6bd5\u7adf\u9884\u8bad\u7ec3\u548c\u76d1\u7763\u5fae\u8c03\u7684\u5b66\u4e60\u6a21\u5f0f\u57fa\u672c\u76f8\u540c\u3002\u5728\u8fd9\u4e2a\u57fa\u7840\u6a21\u578b\u4e4b\u4e0a\u8fdb\u884c\u5f3a\u5316\u5b66\u4e60\uff08RL\uff09\u64cd\u4f5c\uff0c\u80fd\u591f\u63d0\u5347\u5b83\u89e3\u51b3\u67d0\u7c7b\u95ee\u9898\u7684\u80fd\u529b\u3002<\/li><li>\u601d\u7ef4\u94fe\uff08CoT\uff09\u672c\u8d28\u662f\u4ec0\u4e48\uff0c\u4e3a\u4ec0\u4e48\u80fd\u63d0\u9ad8\u7b54\u6848\u7684\u51c6\u786e\u6027\uff1f\u601d\u7ef4\u94fe\u672c\u8d28\u4e0a\u5c31\u662f \u201c\u5927\u58f0\u601d\u8003\u201d\uff08Think out 
loud\uff09\u3002<ul><li>\u5bf9\u4e8e\u6709\u660e\u786e\u7b54\u6848\u7684\u95ee\u9898\uff0c\u901a\u8fc7\u601d\u7ef4\u94fe\u6765\u68c0\u67e5\u5176\u89e3\u51b3\u95ee\u9898\u7684\u6b65\u9aa4\u662f\u5426\u9519\u8bef\uff0c\u4ee5\u53ca\u7ed3\u679c\u662f\u5426\u6b63\u786e\uff1b<\/li><li>\u5bf9\u4e8e\u5f00\u653e\u6027\u95ee\u9898\u800c\u8a00\uff0c\u601d\u7ef4\u94fe\u589e\u52a0\u56de\u7b54\u7684\u53ef\u4fe1\u5ea6\u3002<\/li><li>\u601d\u7ef4\u94fe\u672c\u8d28\u662f<strong>\u7ed3\u6784\u5316\u601d\u8003<\/strong>\uff0c\u8a00\u4e4b\u6709\u7406\u5373\u53ef\uff0c\u662f\u6a21\u578b\u53ef\u89e3\u91ca\u6027\u7684\u53e6\u4e00\u79cd\u4f53\u73b0\u3002<\/li><\/ul><\/li><\/ul>\n\n\n\n<p>\u56e0\u6b64\uff0c\u601d\u7ef4\u94fe\u65b9\u5411\u53ef\u4ee5\u7ee7\u7eed\u575a\u5b9a\u5730\u8d70\u4e0b\u53bb\uff0c\u540c\u65f6\uff0c\u6a21\u578b\u672c\u8eab\u7684\u9650\u5236\u5e94\u8be5\u66f4\u5c11\u4e9b\uff0c\u6bd4\u5982\u4e71\u7801\u6ca1\u5173\u7cfb\uff0c\u9884\u7559\u66f4\u591a\u63a2\u7d22\u7a7a\u95f4\u3002<\/p>\n\n\n\n<h2 id=\"rl-\u6838\u5fc3\u5728\u4e8e-\u5956\u52b1\u51fd\u6570\">2\u3001RL \u6838\u5fc3\u5728\u4e8e \u5956\u52b1\u51fd\u6570<\/h2>\n\n\n\n<p>\u30102025-5-5\u3011\u5ffd\u7565\u5f3a\u5316\u5b66\u4e60\u7b97\u6cd5\u7ec6\u8282\uff0c\u5728reward\u4e0a\u505a\u70b9\u624b\u811a\uff0c\u7b80\u5355\u53c8\u91cd\u8981<\/p>\n\n\n\n<ul><li>\u53c2\u8003&nbsp;<a href=\"https:\/\/www.xiaohongshu.com\/explore\/6818a85c000000002301e1d7\" target=\"_blank\" rel=\"noreferrer noopener\">\u5c0f\u7ea2\u4e66\u5e16\u5b50<\/a><\/li><\/ul>\n\n\n\n<p>(1) rl \u4e0e reward<\/p>\n\n\n\n<p>RLHF \u7cbe\u9ad3: 
\u5c06<strong>\u4eba\u7c7b\u504f\u597d<\/strong>\u8f6c\u5316\u4e3a<strong>\u53ef\u91cf\u5316<\/strong>\u7684\u5956\u52b1\u4fe1\u53f7\u3002<\/p>\n\n\n\n<p>\u5956\u52b1\u51fd\u6570\u544a\u8bc9\u6a21\u578b\u201d\u4ec0\u4e48\u662f\u597d\u7684\u8f93\u51fa\u201d\uff0c\u800crl\u7b97\u6cd5\u53ea\u662f\u5c06\u8fd9\u79cd\u53cd\u9988\u8bad\u7ec3\u5230\u6a21\u578b\u53c2\u6570\u4e2d\u53bb\u3002reward\u4e0e\u6784\u5efa\u9ad8\u8d28\u91cf\u6570\u636e\uff0c\u5bf9\u4e8erl\u6700\u7ec8\u7684\u7ed3\u679c\u6765\u8bf4\u540c\u6837\u91cd\u8981\u3002<\/p>\n\n\n\n<p>deepseek-r1\u7684grpo\u5219\u662f\u9488\u5bf9\u6570\u5b66\u548c\u4ee3\u7801\u4efb\u52a1\u8bbe\u8ba1\u4e86<strong>\u89c4\u5219\u5224\u522b<\/strong>\u7684\u5956\u52b1\u51fd\u6570<\/p>\n\n\n\n<p>(2) \u5956\u52b1\u51fd\u6570\u6784\u5efa\u7b56\u7565<\/p>\n\n\n\n<p>reward \u6784\u9020\u7b56\u7565<\/p>\n\n\n\n<ul><li><strong>\u4efb\u52a1\u76f8\u5173\u6027<\/strong>\uff1a\u5956\u52b1\u4fe1\u53f7\u4e0e\u4efb\u52a1\u76ee\u6807\u76f8\u5173\u3002<ul><li>\u6570\u5b66\u95ee\u9898\u5173\u6ce8<strong>\u6b63\u786e\u6027<\/strong>\uff0c\u5199\u4f5c\u6ce8\u91cd<strong>\u591a\u6837\u6027<\/strong>\uff0c\u9500\u552e\u52a9\u624b\u9700\u8981<strong>\u60c5\u5546<\/strong>\u7b49<\/li><\/ul><\/li><li><strong>\u53ef\u91cf\u5316<\/strong>\uff1a\u53ef\u91cf\u5316\u7684\u6307\u6807\u624d\u53ef\u4ee5\u4ea4\u7ed9rl\u8fdb\u884c\u8bad\u7ec3\u5b66\u4e60\u3002<ul><li>\u7b54\u6848\u5bf9\u9519\u7531\u89c4\u5219\u5224\u65ad\u7ed90,1\u5e03\u5c14\u503c\u3002\u8fd9\u4e2a\u56de\u7b54\u5f88\u597d\u7531reward model\u8f6c\u5316\u4e3a0~1.0\u4e4b\u95f4\u7684\u5f97\u5206<\/li><\/ul><\/li><li><strong>\u76f8\u5bf9\u6027<\/strong>\uff1aPPO\u4e2d\u7ed9\u7684\u662f\u67d0\u4e2a\u7b54\u6848\u7684<strong>\u7edd\u5bf9\u5956\u52b1\u503c<\/strong>(\u901a\u8fc7pairwise \u65b9\u5f0f\u8bad\u7ec3 reward model)\uff0cDPO\u4e2d\u5219\u662f\u6784\u5efa\u7b54\u6848\u95f4\u7684<strong>\u76f8\u5bf9\u504f\u597d<\/strong>\u5173\u7cfb\uff0cGRPO 
\u8ba1\u7b97\u4e00\u6279\u6837\u672c\u7684<strong>\u76f8\u5bf9\u5956\u52b1\u4f18\u52bf<\/strong><\/li><li><strong>\u63a8\u7406\u8fc7\u7a0b<\/strong>\uff1a\u5bf9\u6574\u4e2a\u8fc7\u7a0b\u7ed9\u4e00\u4e2a\u6700\u7ec8\u5956\u52b1\uff0c\u8fd8\u662f\u6bcf\u4e2a\u63a8\u7406\u6b65\u9aa4\u8bc4\u4f30\uff0c\u4ee5\u53ca\u662f\u5426\u9700\u8981\u63a8\u7406\u8fc7\u7a0b\uff0c\u90fd\u53ef\u4ee5\u8bbe\u7f6e\u4e3a\u5956\u52b1\u4fe1\u53f7<\/li><\/ul>\n\n\n\n<h2 id=\"\u65b9\u6cd5\u9009\u62e9\">3\u3001\u65b9\u6cd5\u9009\u62e9<\/h2>\n\n\n\n<p>\u4f55\u65f6\u4f7f\u7528 DPO \u4e0e PPO \u4e0e GRPO \uff1f<\/p>\n\n\n\n<p>\u504f\u597d\u5bf9\u9f50\u7ec4\u5408\u4e2d\u52a0\u5165 GRPO \u540e\uff0c\u6709\u51e0\u9879\u51b3\u7b56\u56e0\u7d20\u9700\u8981\u8003\u8651\uff1a<\/p>\n\n\n\n<ol><li><strong>\u6570\u636e\u53ef\u7528\u6027<\/strong>\uff08\u662f\u5426\u6709\u504f\u597d\u6570\u636e\uff09\uff1aDPO \u4f7f\u7528\u504f\u597d\u6570\u636e\uff08\u9009\u62e9\/\u62d2\u7edd\u7684\u7b54\u6848\uff09\uff0c\u800c PPO \u5219\u9700\u8981\u5148\u7528\u8fd9\u79cd\u504f\u597d\u6570\u636e\u8bad\u7ec3\u4e00\u4e2a\u5956\u52b1\u6a21\u578b\u3002GRPO \u5219\u66f4\u5177\u7075\u6d3b\u6027\uff0c\u56e0\u4e3a\u5b83\u53ef\u4ee5\u4f7f\u7528\u504f\u597d\u6570\u636e\uff0c\u4f46\u5e76\u975e\u5fc5\u987b\u4f7f\u7528\u3002<\/li><li><strong>\u5956\u52b1\u6a21\u578b<\/strong>\uff1aDPO \u901a\u8fc7\u76f4\u63a5\u57fa\u4e8e\u504f\u597d\u8fdb\u884c\u4f18\u5316\uff0c\u5c06\u95ee\u9898\u6784\u5efa\u6210\u5206\u7c7b\u95ee\u9898\uff0c\u4ece\u800c\u6d88\u9664\u4e86\u5bf9\u5355\u72ec\u5956\u52b1\u6a21\u578b\u7684\u9700\u6c42\u3002\u76f8\u6bd4\u4e4b\u4e0b\uff0cPPO \u5219\u9700\u8981\u8bad\u7ec3\u548c\u7ef4\u62a4\u4e00\u4e2a\u5355\u72ec\u7684\u5956\u52b1\u6a21\u578b\uff0c\u8fd9\u589e\u52a0\u4e86\u590d\u6742\u6027\u3002GRPO \u5219\u5904\u4e8e\u4e24\u8005\u4e4b\u95f4\uff0c\u65e2\u652f\u6301\u4f7f\u7528\u663e\u5f0f\u7684\u5956\u52b1\u6a21\u578b\uff08\u5982 
PPO\uff09\uff0c\u4e5f\u652f\u6301\u76f4\u63a5\u4f7f\u7528\u5956\u52b1\u51fd\u6570\u3002<\/li><li><strong>\u8ba1\u7b97\u8d44\u6e90<\/strong>\uff1aDPO \u6700\u9ad8\u6548\uff0c\u56e0\u4e3a\u65e0\u9700\u6dfb\u52a0\u5956\u52b1\u6a21\u578b\u3002PPO \u8ba1\u7b97\u9700\u6c42\u6700\u9ad8\uff0c\u56e0\u4e3a\u5b83\u9700\u8981\u591a\u4e2a\u6a21\u578b\u3002GRPO \u7531\u4e8e\u91c7\u7528\u4e86\u57fa\u4e8e\u7ec4\u7684\u65b9\u6cd5\uff0c\u6240\u4ee5\u6240\u9700\u7684\u8d44\u6e90\u9002\u4e2d\u3002<\/li><\/ol>\n\n\n\n<figure class=\"wp-block-table\"><table><thead><tr><th>\u5bf9\u6bd4\u9879<\/th><th>DPO<\/th><th>PPO<\/th><th>GRPO<\/th><\/tr><\/thead><tbody><tr><td>\u6570\u636e\u53ef\u7528\u6027<\/td><td>\u504f\u597d\u6570\u636e\uff08\u9009\u62e9\/\u62d2\u7edd\u7684\u7b54\u6848\uff09<\/td><td>\u5148\u7528\u504f\u597d\u6570\u636e\u8bad\u7ec3\u5956\u52b1\u6a21\u578b<\/td><td>\u66f4\u7075\u6d3b\uff0c\u53ef\u7528\u504f\u597d\u6570\u636e\uff0c\u4f46\u5e76\u975e\u5fc5\u987b<\/td><\/tr><tr><td>\u5956\u52b1\u6a21\u578b<\/td><td>\u76f4\u63a5\u57fa\u4e8e\u504f\u597d\u8fdb\u884c\u4f18\u5316\uff0c\u5c06\u95ee\u9898\u6784\u5efa\u6210<strong>\u5206\u7c7b<\/strong>\u95ee\u9898\uff0c\u6d88\u9664\u5bf9\u5355\u72ec\u5956\u52b1\u6a21\u578b\u7684\u9700\u6c42<\/td><td>\u8bad\u7ec3\u548c\u7ef4\u62a4\u5355\u72ec\u7684\u5956\u52b1\u6a21\u578b\uff0c\u589e\u52a0\u4e86\u590d\u6742\u6027<\/td><td>\u65e2\u652f\u6301\u4f7f\u7528\u663e\u5f0f\u7684\u5956\u52b1\u6a21\u578b\uff08\u5982 
PPO\uff09\uff0c\u4e5f\u652f\u6301\u76f4\u63a5\u4f7f\u7528\u5956\u52b1\u51fd\u6570<\/td><\/tr><tr><td>\u8ba1\u7b97\u8d44\u6e90<\/td><td>\u6700\u9ad8\u6548\uff0c\u65e0\u9700\u6dfb\u52a0\u5956\u52b1\u6a21\u578b<\/td><td>\u8ba1\u7b97\u9700\u6c42\u6700\u9ad8\uff0c\u9700\u8981\u591a\u4e2a\u6a21\u578b<\/td><td>\u7531\u4e8e\u91c7\u7528\u57fa\u4e8e\u7ec4\u7684\u65b9\u6cd5\uff0c\u6240\u9700\u8d44\u6e90\u9002\u4e2d<\/td><\/tr><\/tbody><\/table><\/figure>\n\n\n\n<p>\u8981\u70b9<\/p>\n\n\n\n<ul><li>\u5f53\u62e5\u6709\u9ad8\u8d28\u91cf\u7684\u504f\u597d\u6570\u636e\u4e14\u8ba1\u7b97\u8d44\u6e90\u6709\u9650\u65f6\uff0c\u9009\u62e9 DPO\u3002<\/li><li>\u5f53\u9700\u8981\u7cbe\u7ec6\u63a7\u5236\u3001\u62e5\u6709\u5145\u8db3\u7684\u8ba1\u7b97\u8d44\u6e90\u5e76\u4e14\u80fd\u591f\u6295\u5165\u7cbe\u529b\u8fdb\u884c\u4ed4\u7ec6\u8c03\u6574\u65f6\uff0c\u9009\u62e9 PPO\u3002<\/li><li>\u5f53\u60f3\u8981\u6574\u5408\u591a\u4e2a\u5956\u52b1\u4fe1\u53f7\uff0c\u6216\u8005\u6ca1\u6709\u5168\u9762\u7684\u504f\u597d\u6570\u636e\u65f6\uff0c\u9009\u62e9 GRPO\u3002<\/li><\/ul>\n\n\n\n<figure class=\"wp-block-image size-full\"><img loading=\"lazy\" width=\"565\" height=\"535\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/06\/image-87.png\" alt=\"\" class=\"wp-image-27259\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/06\/image-87.png 565w, http:\/\/139.9.1.231\/wp-content\/uploads\/2025\/06\/image-87-300x284.png 300w\" sizes=\"(max-width: 565px) 100vw, 565px\" \/><\/figure>\n\n\n\n<h2 id=\"loss\">loss<\/h2>\n\n\n\n<h4 id=\"\u4e3a\u4ec0\u4e48\u4e0d\u7528-\u68af\u5ea6\u4e0b\u964d-\">\u4e3a\u4ec0\u4e48\u4e0d\u7528 \u68af\u5ea6\u4e0b\u964d \uff1f<\/h4>\n\n\n\n<p>RLHF \u4e3a\u4ec0\u4e48\u4e0d\u76f4\u63a5\u5bf9 loss \u8fdb\u884c<strong>\u68af\u5ea6\u4e0b\u964d<\/strong>\u6765\u6c42\u89e3\uff1f<\/p>\n\n\n\n<p>\u6838\u5fc3\u539f\u56e0\uff1a<\/p>\n\n\n\n<ul><li>loss 
\u6216\u4f18\u5316\u76ee\u6807<strong>\u4e0d\u53ef\u5fae<\/strong>\uff0c\u770b\u4e00\u4e0b\u4f18\u5316\u76ee\u6807\u7684\u7ea2\u8272\u6846\u90e8\u5206\uff1a<\/li><\/ul>\n\n\n\n<p>\u635f\u5931\u51fd\u6570\u8868\u8fbe\u5f0f\u4e2d\u7684 y \u662f\u91c7\u6837\u51fa\u6765\u7684,&nbsp;<code>x~D, y~pi(y|x)<\/code>&nbsp;, \u53ef\u80fd\u662f greedy\uff0cbeam search \u7b49\uff0c\u5728\u8bcd\u8868\u4e0a\u8fdb\u884c\u91c7\u6837\u6216\u9009\u62e9\uff0c\u800c\u4e0d\u662f\u4ea7\u751f\u8fde\u7eed\u7684\u3001\u53ef\u5fae\u5206\u7684\u8f93\u51fa\u3002\u6240\u4ee5\uff0c\u6ca1\u6cd5\u76f4\u63a5\u4f7f\u7528\u68af\u5ea6\u4e0b\u964d\uff0c\u800c\u662f\u7528 PPO \u7b49<strong>\u7b56\u7565\u68af\u5ea6<\/strong>\u6765\u6c42\u89e3\u3002<\/p>\n\n\n\n<h2 id=\"rlhf-\u95ee\u9898\">RLHF \u95ee\u9898<\/h2>\n\n\n\n<p>\u30102025-2-6\u3011<a href=\"https:\/\/mp.weixin.qq.com\/s\/thTwdVgc4lfYRj6WWpKBwA\" target=\"_blank\" rel=\"noreferrer noopener\">Andrej Karpathy \u6700\u65b0\u89c6\u9891\u76db\u8d5e DeepSeek\uff1aR1 \u6b63\u5728\u53d1\u73b0\u4eba\u7c7b\u601d\u8003\u7684\u903b\u8f91\u5e76\u8fdb\u884c\u590d\u73b0<\/a><\/p>\n\n\n\n<ul><li>\u89c6\u9891\u94fe\u63a5\uff1a<a href=\"https:\/\/www.youtube.com\/watch?v=7xTGNNLPyMI\" target=\"_blank\" rel=\"noreferrer noopener\">youtube<\/a><\/li><li>DeepSeek R1 \u5728\u6027\u80fd\u65b9\u9762\u4e0e OpenAI \u6a21\u578b\u4e0d\u76f8\u4e0a\u4e0b\uff0c\u63a8\u52a8\u4e86 RL 
\u6280\u672f\u7684\u53d1\u5c55<\/li><\/ul>\n\n\n\n<p>\u5982\u679c\u53ea\u662f\u6a21\u4eff\u4eba\u7c7b\u73a9\u5bb6\uff0c\u5c31\u6c38\u8fdc\u65e0\u6cd5\u8d85\u8d8a\u6781\u9650\u3002<\/p>\n\n\n\n<p>\u5f3a\u5316\u5b66\u4e60\u7684\u4f18\u52bf<\/p>\n\n\n\n<ul><li>\u4e0d\u53d7\u4eba\u7c7b\u8868\u73b0\u7684\u9650\u5236\u3002<ul><li>\u56f4\u68cb\u6e38\u620f\u4e2d\uff0c\u5f3a\u5316\u5b66\u4e60\u4f1a\u81ea\u5df1\u4e0e\u81ea\u5df1\u5bf9\u5f08\uff0c\u901a\u8fc7\u8bd5\u9519\u6765\u5b66\u4e60\u54ea\u4e9b\u8d70\u6cd5\u80fd\u8d62\u5f97\u6bd4\u8d5b\u3002\u6700\u7ec8\u4f7fAlphaGo\u80fd\u591f\u8d85\u8d8a\u4eba\u7c7b\u9876\u5c16\u68cb\u624b\uff0c\u751a\u81f3\u53d1\u660e\u4e86\u4e00\u4e9b\u4eba\u7c7b\u68cb\u624b\u4ece\u672a\u60f3\u5230\u8fc7\u7684\u521b\u65b0\u8d70\u6cd5<\/li><li>AlphaGo \u5bf9\u5f08\u4e2d\uff0c\u5b9e\u9645\u4e0a\u4e0b\u4e86\u4e00\u6b65\u4eba\u7c7b\u4e13\u5bb6\u901a\u5e38\u4e0d\u4f1a\u4e0b\u7684\u68cb\u3002\u8bc4\u4f30\u6765\u770b\uff0c\u8fd9\u6b65\u68cb\u88ab\u4eba\u7c7b\u73a9\u5bb6\u4e0b\u7684\u6982\u7387\u5927\u7ea6\u662f1\/10,000\u3002<\/li><\/ul><\/li><\/ul>\n\n\n\n<p>\u6240\u6709\u95ee\u9898\u90fd\u5c5e\u4e8e<strong>\u53ef\u9a8c\u8bc1<\/strong>\u9886\u57df\u3002\u4efb\u4f55\u65f6\u5019\u90fd\u53ef\u4ee5\u5f88\u5bb9\u6613\u5730\u4e0e\u4e00\u4e2a\u5177\u4f53\u7b54\u6848\u8fdb\u884c\u6bd4\u8f83\u8bc4\u5206\u3002<\/p>\n\n\n\n<p>\u57fa\u672c\u601d\u8def:<\/p>\n\n\n\n<blockquote 
class=\"wp-block-quote\"><p>\u8bad\u7ec3\u4eba\u7c7b\u7684\u6a21\u62df\u5668\uff0c\u5e76\u901a\u8fc7\u5f3a\u5316\u5b66\u4e60\u5bf9\u8fd9\u4e9b\u6a21\u62df\u5668\u8fdb\u884c\u4f18\u5316<\/p><\/blockquote>\n\n\n\n<p>\u4ece\u4eba\u7c7b\u53cd\u9988\u4e2d\u8fdb\u884c\u5f3a\u5316\u5b66\u4e60\u7684\u4f18\u52bf<\/p>\n\n\n\n<ul><li>\u80fd\u5728\u4efb\u610f\u9886\u57df\u8fdb\u884c\u5f3a\u5316\u5b66\u4e60\uff0c\u5305\u62ec<strong>\u65e0\u6cd5\u9a8c\u8bc1<\/strong>\u7684\u9886\u57df\u3002<ul><li>\u4f8b\u5982\uff0c\u50cf\u6458\u8981\u751f\u6210\u3001\u5199\u8bd7\u3001\u7f16\u7b11\u8bdd\u6216\u4efb\u4f55\u5176\u4ed6\u521b\u610f\u5199\u4f5c<\/li><\/ul><\/li><li>RLHF \u5374\u7ed5\u8fc7\u4e86\u8fd9\u4e2a\u95ee\u9898\uff0c\u4e0d\u76f4\u63a5\u751f\u6210\uff0c\u800c\u662f\u6392\u5e8f<\/li><\/ul>\n\n\n\n<p>\u8fd9\u4e0e\u5224\u522b\u5668\u548c\u751f\u6210\u5668\u4e4b\u95f4\u7684\u5dee\u8ddd\u6709\u5173\uff1a\u5bf9\u4e8e\u4eba\u7c7b\u6765\u8bf4\uff0c\u5224\u522b\u6bd4\u751f\u6210\u8981\u5bb9\u6613\u5f97\u591a<\/p>\n\n\n\n<p>RLHF\u663e\u8457\u7f3a\u70b9<\/p>\n\n\n\n<ul><li>\u5f3a\u5316\u5b66\u4e60\u4e0d\u662f\u57fa\u4e8e\u5b9e\u9645\u7684\u4eba\u7c7b\u5224\u65ad\uff0c\u800c\u662f\u57fa\u4e8e\u4eba\u7c7b\u7684\u4e00\u4e2a<strong>\u6709\u635f\u6a21\u62df<\/strong>\uff0c\u53ef\u80fd\u4f1a\u4ea7\u751f\u8bef\u5bfc<\/li><li>\u5f3a\u5316\u5b66\u4e60\u64c5\u957f\u201c\u6b3a\u9a97\u201d\u6a21\u578b\uff0c\u8bef\u5bfc\u5176\u505a\u51fa\u8bb8\u591a\u9519\u8bef\u7684\u51b3\u5b9a\u3002<\/li><\/ul>\n\n\n\n<p><\/p>\n\n\n\n<h2 id=\"\u5956\u52b1\u6a21\u578b\">\u5956\u52b1\u6a21\u578b<\/h2>\n\n\n\n<h4 id=\"ppo-\u4e2d-rm-\u5982\u4f55\u5de5\u4f5c\">ppo \u4e2d RM \u5982\u4f55\u5de5\u4f5c<\/h4>\n\n\n\n<h4 id=\"ppo-\u4e3a\u5565\u4e0d\u76f4\u63a5\u7528-reward-model\">PPO \u4e3a\u5565\u4e0d\u76f4\u63a5\u7528 Reward Model<\/h4>\n\n\n\n<p>RLHF\u4e2d\uff0c\u4e3a\u4ec0\u4e48 PPO \u9700\u8981 Critic\u6a21\u578b \u800c\u4e0d\u662f\u76f4\u63a5\u4f7f\u7528 Reward Model 
\uff1f<\/p>\n\n\n\n<p>\u5f3a\u5316\u5b66\u4e60\u4e2d\uff0cPPO\uff08Proximal Policy Optimization\uff09\u57fa\u4e8e\u7b56\u7565\u68af\u5ea6\u8bad\u7ec3\u5f3a\u5316\u5b66\u4e60\u667a\u80fd\u4f53\u3002<\/p>\n\n\n\n<p>PPO\u7b97\u6cd5\u4e2d\u5f15\u5165<strong>Critic\u6a21\u578b<\/strong>\u7684\u4e3b\u8981\u76ee\u7684\uff1a\u63d0\u4f9b<strong>\u4ef7\u503c\u4f30\u8ba1\u5668<\/strong>\uff0c\u7528\u4e8e\u8bc4\u4f30<strong>\u72b6\u6001<\/strong>\u6216<strong>\u72b6\u6001\u52a8\u4f5c\u5bf9<\/strong>\u7684\u4ef7\u503c\uff0c\u4ece\u800c\u8f85\u52a9\u7b56\u7565\u7684\u66f4\u65b0\u548c\u4f18\u5316\u3002<\/p>\n\n\n\n<p>\u867d\u7136<strong>\u5956\u52b1\u6a21\u578b<\/strong>\uff08Reward Model\uff09\u53ef\u4ee5\u63d0\u4f9b\u6bcf\u4e2a\u72b6\u6001\u6216\u72b6\u6001\u52a8\u4f5c\u5bf9\u7684<strong>\u5373\u65f6\u5956\u52b1\u4fe1\u53f7<\/strong>\uff0c\u4f46\u5b83\u5e76\u4e0d\u80fd\u76f4\u63a5\u63d0\u4f9b\u5bf9\u5e94\u7684<strong>\u4ef7\u503c\u4f30\u8ba1<\/strong>\u3002<\/p>\n\n\n\n<ul><li><strong>\u5956\u52b1\u4fe1\u53f7<\/strong>\u53ea\u53cd\u6620\u4e86<strong>\u5f53\u524d\u52a8\u4f5c<\/strong>\u7684\u5373\u65f6\u53cd\u9988\uff0c\u800c\u5e76\u6ca1\u6709\u63d0\u4f9b\u5173\u4e8e\u5728<strong>\u957f\u671f\u65f6\u95f4\u5c3a\u5ea6\u4e0a<\/strong>\u7684\u4ef7\u503c\u4fe1\u606f\u3002<\/li><\/ul>\n\n\n\n<p><strong>Critic\u6a21\u578b<\/strong>\u4f30\u8ba1\u72b6\u6001\u6216\u72b6\u6001\u52a8\u4f5c\u5bf9\u7684\u957f\u671f\u4ef7\u503c\uff0c\u4e5f\u79f0\u4e3a<strong>\u72b6\u6001\u503c\u51fd\u6570<\/strong>\u6216<strong>\u52a8\u4f5c\u503c\u51fd\u6570<\/strong>\u3002Critic\u6a21\u578b\u80fd\u5b66\u4e60\u548c\u9884\u6d4b\u5728\u5f53\u524d\u72b6\u6001\u4e0b\u91c7\u53d6\u4e0d\u540c\u52a8\u4f5c\u6240\u83b7\u5f97\u7684<strong>\u7d2f\u79ef\u5956\u52b1<\/strong>\uff0c\u5b83\u63d0\u4f9b\u4e86\u5bf9\u7b56\u7565\u6539\u8fdb\u7684\u6307\u5bfc\u3002<\/p>\n\n\n\n<p>PPO\u7b97\u6cd5\u4f7f\u7528Critic\u6a21\u578b\u7684\u4f30\u8ba1\u503c\u6765\u8ba1\u7b97\u4f18\u52bf\u51fd\u6570\uff0c\u4ece\u800c\u8c03\u6574\u7b56\u7565\
u7684\u66f4\u65b0\u5e45\u5ea6\uff0c\u4f7f\u5f97\u66f4\u6709\u5229\u4e8e\u4ea7\u751f\u66f4\u9ad8\u957f\u671f\u56de\u62a5\u7684\u52a8\u4f5c\u88ab\u9009\u62e9\u3002<\/p>\n\n\n\n<p>\u53e6\u5916\uff0cCritic\u6a21\u578b\u8fd8\u53ef\u7528\u4e8e\u8bc4\u4f30\u4e0d\u540c\u7b56\u7565\u7684\u6027\u80fd\uff0c\u4e3a\u6a21\u578b\u7684\u8bc4\u4f30\u548c\u9009\u62e9\u63d0\u4f9b\u4f9d\u636e\u3002PPO\u7b97\u6cd5\u4e2d\u7684<strong>Actor-Critic\u67b6\u6784<\/strong>\u5141\u8bb8\u667a\u80fd\u4f53\u540c\u65f6\u5b66\u4e60<strong>\u7b56\u7565<\/strong>\u548c<strong>\u4ef7\u503c<\/strong>\u51fd\u6570\uff0c\u5e76\u901a\u8fc7\u534f\u540c\u8bad\u7ec3\u6765\u63d0\u9ad8\u6027\u80fd\u3002<\/p>\n\n\n\n<p>\u56e0\u6b64\uff0c\u5728 RLHF\uff08Reinforcement Learning from Human Feedback\uff09\u4e2d\uff0cPPO\u7b97\u6cd5\u9700\u8981Critic\u6a21\u578b\u800c\u4e0d\u662f\u76f4\u63a5\u4f7f\u7528\u5956\u52b1\u6a21\u578b\uff0c\u662f\u4e3a\u4e86\u63d0\u4f9b\u5bf9\u72b6\u6001\u6216\u72b6\u6001\u52a8\u4f5c\u5bf9\u7684\u4ef7\u503c\u4f30\u8ba1\uff0c\u5e76\u652f\u6301\u7b56\u7565\u7684\u6539\u8fdb\u548c\u4f18\u5316\u3002Critic\u6a21\u578b\u7684\u5f15\u5165\u53ef\u4ee5\u63d0\u4f9b\u66f4\u5168\u9762\u548c\u51c6\u786e\u7684\u4fe1\u606f\uff0c\u4ece\u800c\u589e\u5f3a\u7b97\u6cd5\u7684\u8bad\u7ec3\u6548\u679c\u548c\u5b66\u4e60\u80fd\u529b\u3002<\/p>\n\n\n\n<p><\/p>\n\n\n\n<h4 id=\"\u5373\u65f6\u5956\u52b1\u548c\u957f\u671f\u5956\u52b1\">\u5373\u65f6\u5956\u52b1\u548c\u957f\u671f\u5956\u52b1<\/h4>\n\n\n\n<p><strong>\u5373\u65f6\u5956\u52b1<\/strong>&nbsp;\u4e0e \u72b6\u6001\u52a8\u4f5c\u5bf9\u7684<strong>\u957f\u671f\u4ef7\u503c<\/strong>&nbsp;\u7684\u5dee\u522b\u662f\u4ec0\u4e48\uff1f<\/p>\n\n\n\n<p><strong>\u5373\u65f6\u5956\u52b1<\/strong>\uff08Immediate Reward\uff09\u548c\u72b6\u6001\u52a8\u4f5c\u5bf9\u7684<strong>\u957f\u671f\u4ef7\u503c<\/strong>\uff08Long-Term 
Value\uff09\u4ee3\u8868\u4e86\u5f3a\u5316\u5b66\u4e60\u4e2d\u4e0d\u540c\u7684\u6982\u5ff5\u548c\u65f6\u95f4\u5c3a\u5ea6\u3002<\/p>\n\n\n\n<ul><li>\u5373\u65f6\u5956\u52b1\u662f\u6307\u667a\u80fd\u4f53\u5728\u6267\u884c\u67d0\u4e2a\u52a8\u4f5c\u540e<strong>\u7acb\u5373<\/strong>\u83b7\u5f97\u7684\u53cd\u9988\u4fe1\u53f7\u3002\u7531\u73af\u5883\u63d0\u4f9b\uff0c\u7528\u4e8e\u8868\u793a\u5f53\u524d\u52a8\u4f5c\u7684\u597d\u574f\u7a0b\u5ea6\u3002\u5373\u65f6\u5956\u52b1\u662f\u4e00\u79cd\u5373\u65f6\u53cd\u9988\uff0c\u53ef\u4ee5\u6307\u793a\u5f53\u524d\u52a8\u4f5c\u7684\u7acb\u5373\u7ed3\u679c\u662f\u5426\u7b26\u5408\u667a\u80fd\u4f53\u7684\u76ee\u6807\u3002<\/li><li>\u800c\u72b6\u6001\u52a8\u4f5c\u5bf9\u7684<strong>\u957f\u671f\u4ef7\u503c<\/strong>\u6d89\u53ca\u66f4\u957f\u65f6\u95f4\u5c3a\u5ea6\u4e0a\u7684\u8bc4\u4f30\uff0c\u8003\u8651\u4e86\u667a\u80fd\u4f53\u5728\u5f53\u524d\u72b6\u6001\u4e0b\u9009\u62e9\u4e0d\u540c\u52a8\u4f5c\u6240\u5bfc\u81f4\u7684\u672a\u6765\u56de\u62a5\u7684\u7d2f\u79ef\u3002\u957f\u671f\u4ef7\u503c\u53ef\u4ee5\u8868\u793a\u4e3a<strong>\u72b6\u6001\u503c\u51fd\u6570<\/strong>\uff08State Value Function\uff09\u6216<strong>\u52a8\u4f5c\u503c\u51fd\u6570<\/strong>\uff08Action Value 
Function\uff09\u3002<ul><li>\u72b6\u6001\u503c\u51fd\u6570\uff08V-function\uff09\u8868\u793a\u5728\u7ed9\u5b9a\u72b6\u6001\u4e0b\uff0c\u667a\u80fd\u4f53\u4ece\u8be5\u72b6\u6001\u5f00\u59cb\u6267\u884c\u4e00\u7cfb\u5217\u52a8\u4f5c\uff0c\u7136\u540e\u6309\u7167\u67d0\u4e2a\u7b56\u7565\u8fdb\u884c\u51b3\u7b56\uff0c\u4ece\u800c\u83b7\u5f97\u7684\u9884\u671f\u7d2f\u79ef\u56de\u62a5\u3002\u72b6\u6001\u503c\u51fd\u6570\u4f30\u8ba1\u4e86\u667a\u80fd\u4f53\u5904\u4e8e\u67d0\u4e2a\u72b6\u6001\u65f6\u6240\u80fd\u83b7\u5f97\u7684\u957f\u671f\u4ef7\u503c\uff0c\u53cd\u6620\u4e86\u72b6\u6001\u7684\u4f18\u52a3\u7a0b\u5ea6\u3002<\/li><li>\u52a8\u4f5c\u503c\u51fd\u6570\uff08Q-function\uff09\u5219\u8868\u793a\u5728\u7ed9\u5b9a\u72b6\u6001\u4e0b\uff0c\u667a\u80fd\u4f53\u9009\u62e9\u67d0\u4e2a\u52a8\u4f5c\u540e\uff0c\u6309\u7167\u67d0\u4e2a\u7b56\u7565\u8fdb\u884c\u51b3\u7b56\uff0c\u4ece\u8be5\u72b6\u6001\u8f6c\u79fb\u5230\u4e0b\u4e00\u4e2a\u72b6\u6001\u5e76\u83b7\u5f97\u9884\u671f\u7d2f\u79ef\u56de\u62a5\u7684\u4ef7\u503c\u3002\u52a8\u4f5c\u503c\u51fd\u6570\u4f30\u8ba1\u4e86\u5728\u7ed9\u5b9a\u72b6\u6001\u4e0b\u91c7\u53d6\u4e0d\u540c\u52a8\u4f5c\u7684\u957f\u671f\u4ef7\u503c\uff0c\u53ef\u4ee5\u5e2e\u52a9\u667a\u80fd\u4f53\u9009\u62e9\u5728\u6bcf\u4e2a\u72b6\u6001\u4e0b\u6700\u4f18\u7684\u52a8\u4f5c\u3002<\/li><\/ul><\/li><\/ul>\n\n\n\n<p>\u957f\u671f\u4ef7\u503c\u8003\u8651\u4e86\u667a\u80fd\u4f53\u5728<strong>\u672a\u6765\u51b3\u7b56<\/strong>\u8fc7\u7a0b\u4e2d\u6240\u80fd\u83b7\u5f97\u7684<strong>\u7d2f\u79ef\u56de\u62a5<\/strong><\/p>\n\n\n\n<ul><li>\u76f8\u6bd4\u4e4b\u4e0b\uff0c\u5373\u65f6\u5956\u52b1\u53ea\u63d0\u4f9b\u4e86<strong>\u5f53\u524d<\/strong>\u52a8\u4f5c\u7684\u5373\u65f6\u53cd\u9988\u3002<\/li><li>\u957f\u671f\u4ef7\u503c\u5bf9\u667a\u80fd\u4f53\u7684\u51b3\u7b56\u5177\u6709\u66f4\u5168\u9762\u7684\u5f71\u54cd\uff0c\u53ef\u4ee5\u5e2e\u52a9\u667a\u80fd\u4f53\u66f4\u597d\u5730\u8bc4\u4f30\u5f53\u524d\u72b6\u6001\u548c\u52a8\u4f5c\u7684\u957f\u671f\u6548\u679c\uff0c\u5e
76\u6307\u5bfc\u667a\u80fd\u4f53\u5728\u957f\u671f\u65f6\u95f4\u5c3a\u5ea6\u4e0a\u4f5c\u51fa\u66f4\u4f18\u7684\u51b3\u7b56\u3002<\/li><\/ul>\n\n\n\n<p>\u5728\u5f3a\u5316\u5b66\u4e60\u4e2d\uff0c\u957f\u671f\u4ef7\u503c\u7684\u4f30\u8ba1\u5bf9\u4e8e\u786e\u5b9a\u6027\u7b56\u7565\u9009\u62e9\u548c\u4ef7\u503c\u4f18\u5316\u975e\u5e38\u91cd\u8981\uff0c\u800c\u5373\u65f6\u5956\u52b1\u5219\u63d0\u4f9b\u4e86\u5bf9\u5f53\u524d\u52a8\u4f5c\u7684\u76f4\u63a5\u53cd\u9988\u3002\u8fd9\u4e24\u8005\u76f8\u4e92\u8865\u5145\uff0c\u7ed3\u5408\u8d77\u6765\u53ef\u4ee5\u5e2e\u52a9\u667a\u80fd\u4f53\u5b9e\u73b0\u66f4\u597d\u7684\u51b3\u7b56\u548c\u5b66\u4e60\u6548\u679c\u3002<\/p>\n\n\n\n<p><\/p>\n\n\n\n<h4 id=\"ppo-\u4f18\u52bf\u51fd\u6570\">PPO \u4f18\u52bf\u51fd\u6570<\/h4>\n\n\n\n<p>PPO \u4e2d\u4f18\u52bf\u51fd\u6570\u6307\u4ec0\u4e48<\/p>\n\n\n\n<p>\u5728 Proximal Policy Optimization\uff08PPO\uff09\u7b97\u6cd5\u4e2d\uff0c<strong>\u4f18\u52bf\u51fd\u6570<\/strong>\uff08Advantage Function\uff09\u7528\u4e8e\u8bc4\u4f30<strong>\u72b6\u6001-\u52a8\u4f5c\u5bf9<\/strong>\u7684\u76f8\u5bf9\u4f18\u52a3\u7a0b\u5ea6\u3002\u5b83\u8861\u91cf\u4e86\u6267\u884c\u67d0\u4e2a\u52a8\u4f5c\u76f8\u5bf9\u4e8e\u5e73\u5747\u6c34\u5e73\u7684\u4f18\u52a3\uff0c\u5373\u5728\u7ed9\u5b9a\u72b6\u6001\u4e0b\u91c7\u53d6\u67d0\u4e2a\u52a8\u4f5c\u76f8\u5bf9\u4e8e\u91c7\u53d6\u5e73\u5747\u52a8\u4f5c\u7684\u6548\u679c\u3002<\/p>\n\n\n\n<p>\u4f18\u52bf\u51fd\u6570\u5b9a\u4e49\uff1a<\/p>\n\n\n\n<blockquote class=\"wp-block-quote\"><p><code>Advantage(s, a)<\/code>&nbsp;=&nbsp;<code>Q(s, a)<\/code>&nbsp;&#8211;&nbsp;<code>V(s)<\/code><\/p><\/blockquote>\n\n\n\n<p>\u5176\u4e2d<\/p>\n\n\n\n<ul><li><code>Advantage(s, a)<\/code>&nbsp;\u8868\u793a\u5728\u72b6\u6001 s \u4e0b\u91c7\u53d6\u52a8\u4f5c a \u7684\u4f18\u52bf\u51fd\u6570\u503c<\/li><li><code>Q(s, a)<\/code>&nbsp;\u8868\u793a\u72b6\u6001\u52a8\u4f5c\u5bf9 (s, a) 
\u7684\u52a8\u4f5c\u503c\u51fd\u6570\uff08\u4e5f\u79f0\u4e3a Q \u51fd\u6570\uff09<\/li><li><code>V(s)<\/code>&nbsp;\u8868\u793a\u72b6\u6001\u503c\u51fd\u6570\u3002<\/li><\/ul>\n\n\n\n<p>\u4f18\u52bf\u51fd\u6570\u7684\u4f5c\u7528\u5728\u4e8e\u5e2e\u52a9\u8bc4\u4f30\u5f53\u524d\u52a8\u4f5c\u7684\u76f8\u5bf9\u4ef7\u503c\uff0c\u4ee5\u4fbf\u5728\u7b56\u7565\u66f4\u65b0\u8fc7\u7a0b\u4e2d\u786e\u5b9a\u5e94\u91c7\u53d6\u7684\u52a8\u4f5c\u3002\u901a\u8fc7\u6bd4\u8f83\u4e0d\u540c\u52a8\u4f5c\u7684\u4f18\u52bf\u51fd\u6570\u503c\uff0c\u53ef\u4ee5\u51b3\u5b9a\u54ea\u4e9b\u52a8\u4f5c\u662f\u66f4\u597d\u7684\u9009\u62e9\u3002\u6b63\u7684\u4f18\u52bf\u51fd\u6570\u503c\u8868\u793a\u6267\u884c\u7684\u52a8\u4f5c\u6bd4\u5e73\u5747\u6c34\u5e73\u66f4\u597d\uff0c\u800c\u8d1f\u7684\u4f18\u52bf\u51fd\u6570\u503c\u8868\u793a\u6267\u884c\u7684\u52a8\u4f5c\u6bd4\u5e73\u5747\u6c34\u5e73\u66f4\u5dee\u3002<\/p>\n\n\n\n<p>\u5728PPO\u7b97\u6cd5\u4e2d\uff0c\u4f18\u52bf\u51fd\u6570\u7528\u4e8e\u8ba1\u7b97\u7b56\u7565\u66f4\u65b0\u7684\u76ee\u6807\uff0c\u4ee5\u4fbf\u8c03\u6574\u7b56\u7565\u6982\u7387\u5206\u5e03\u6765\u63d0\u9ad8\u4f18\u52bf\u51fd\u6570\u4e3a\u6b63\u7684\u52a8\u4f5c\u7684\u6982\u7387\uff0c\u5e76\u964d\u4f4e\u4f18\u52bf\u51fd\u6570\u4e3a\u8d1f\u7684\u52a8\u4f5c\u7684\u6982\u7387\uff0c\u4ece\u800c\u6539\u8fdb\u7b56\u7565\u7684\u6027\u80fd\u3002<\/p>\n\n\n\n<h1>\u76f8\u5173\u8bba\u6587\uff1a<\/h1>\n\n\n\n<ul class=\"has-medium-pink-color has-text-color\"><li><strong>A Comprehensive Survey of LLM Alignment Techniques: RLHF, RLAIF, PPO, DPO and More. <\/strong><a href=\"https:\/\/arxiv.org\/pdf\/2407.16216\"><strong>https:\/\/arxiv.org\/pdf\/2407.16216<\/strong><\/a><\/li><li><strong>Proximal Policy Optimization Algorithms. 
<\/strong> <a href=\"https:\/\/arxiv.org\/abs\/1707.06347\"><strong>https:\/\/arxiv.org\/abs\/1707.06347<\/strong><\/a><\/li><\/ul>\n\n\n\n<ul><li>KTO\uff0cKahneman-Tversky \u4f18\u5316\uff0c\u53c2\u9605\u8bba\u6587\u300aKTO: Model alignment as prospect theoretic optimization\u300b\u3002<\/li><li>DRO\uff0c\u76f4\u63a5\u5956\u52b1\u4f18\u5316\uff0c\u53c2\u9605\u8bba\u6587\u300aOffline regularised reinforcement learning for large language models alignment\u300b\u3002<\/li><li>SimPO\uff0c\u7b80\u5355\u504f\u597d\u4f18\u5316\uff0c\u53c2\u9605\u8bba\u6587\u300aSimPO: Simple preference optimization with a reference-free reward\u300b<\/li><\/ul>\n","protected":false},"excerpt":{"rendered":"<p>\u53c2\u8003\u8bba\u6587\uff1aA Comprehensive Survey of LLM Alignment Techniques &hellip; <a href=\"http:\/\/139.9.1.231\/index.php\/2025\/02\/05\/llm-alignment-techniques-rlhf-rlaif-ppo-dpo-and-more\/\" class=\"more-link\">\u7ee7\u7eed\u9605\u8bfb<span class=\"screen-reader-text\">LLM\u8bad\u7ec3-\u4eba\u5de5\u5f3a\u5316\u53cd\u9988\u5bf9\u9f50\u7b97\u6cd5\uff1aRLHF, RLAIF, PPO, DPO and 
More<\/span><\/a><\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":[],"categories":[42,4,9,38],"tags":[],"_links":{"self":[{"href":"http:\/\/139.9.1.231\/index.php\/wp-json\/wp\/v2\/posts\/24325"}],"collection":[{"href":"http:\/\/139.9.1.231\/index.php\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"http:\/\/139.9.1.231\/index.php\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"http:\/\/139.9.1.231\/index.php\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"http:\/\/139.9.1.231\/index.php\/wp-json\/wp\/v2\/comments?post=24325"}],"version-history":[{"count":569,"href":"http:\/\/139.9.1.231\/index.php\/wp-json\/wp\/v2\/posts\/24325\/revisions"}],"predecessor-version":[{"id":27841,"href":"http:\/\/139.9.1.231\/index.php\/wp-json\/wp\/v2\/posts\/24325\/revisions\/27841"}],"wp:attachment":[{"href":"http:\/\/139.9.1.231\/index.php\/wp-json\/wp\/v2\/media?parent=24325"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"http:\/\/139.9.1.231\/index.php\/wp-json\/wp\/v2\/categories?post=24325"},{"taxonomy":"post_tag","embeddable":true,"href":"http:\/\/139.9.1.231\/index.php\/wp-json\/wp\/v2\/tags?post=24325"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}