{"id":11785,"date":"2023-01-20T19:48:00","date_gmt":"2023-01-20T11:48:00","guid":{"rendered":"http:\/\/139.9.1.231\/?p=11785"},"modified":"2023-01-27T18:04:01","modified_gmt":"2023-01-27T10:04:01","slug":"blip-coca-and-beitv","status":"publish","type":"post","link":"http:\/\/139.9.1.231\/index.php\/2023\/01\/20\/blip-coca-and-beitv\/","title":{"rendered":"\u591a\u6a21\u6001|BLIP \u3001CoCa and  BeiTv"},"content":{"rendered":"\n\n\n<h2>BLIP<\/h2>\n\n\n\n<p class=\"has-text-align-center\"><em>BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation<\/em><\/p>\n\n\n\n<p class=\"has-light-pink-background-color has-background\"><strong>\u4ee3\u7801:  <\/strong><a rel=\"noreferrer noopener\" href=\"https:\/\/github.com\/salesforce\/BLIP\" target=\"_blank\"><em>https:\/\/github.com\/salesforce\/BLIP<\/em><\/a><\/p>\n\n\n\n<p>         \u672c\u6587\u662f ALBEF \u539f\u73ed\u4eba\u9a6c\u505a\u7684\uff0c\u57fa\u672c\u53ef\u4ee5\u770b\u505a\u5438\u6536\u4e86 VLMo \u601d\u60f3\u7684 ALBEF\u3002\u8bad\u7ec3\u7684 loss \u548c\u6280\u5de7\u90fd\u4e0e ALBEF \u4e00\u81f4\uff0c\u5c5e\u4e8e ALBEF \u7684\u540e\u7eed\u5de5\u4f5c\u3002<\/p>\n\n\n\n<p>        \u672c\u6587motivation\u4e3b\u8981\u6709\u4e24\u4e2a\uff1a\u4e00\u662f\u4e4b\u524d\u591a\u6a21\u6001\u9884\u8bad\u7ec3\u6a21\u578b\u7ed3\u6784\u8981\u4e48\u662f\u57fa\u4e8e\u7f16\u7801\u5668\uff0c\u4e0d\u80fd\u76f4\u63a5\u7528\u4e8e\u751f\u6210\u4efb\u52a1\uff0c\u8981\u4e48\u662f\u57fa\u4e8e\u7f16\u7801\u89e3\u7801\u5668\uff0c\u5728\u68c0\u7d22\u7c7b\u4efb\u52a1\u4e0a\u4e0d\u65b9\u4fbf\uff0c\u672c\u6587\u8bbe\u8ba1\u7684\u7ed3\u6784\u5305\u542b\u5355\u6a21\u6001\u7f16\u7801\u5668\u3001\u89c6\u89c9\u6307\u5bfc\u6587\u672c\u7f16\u7801\u5668\u3001\u89c6\u89c9\u6307\u5bfc\u6587\u672c\u89e3\u7801\u5668\uff0c\u53ef\u4ee5\u65b9\u4fbf\u5730\u7528\u5bf9\u6bd4\u5b66\u4e60\u3001ITM\uff08Image-Text Matching ( ITM ): 
\u56fe\u6587\u5339\u914d\u4efb\u52a1\uff0c\u9488\u5bf9\u7684\u662f\u56fe\u6587\u4ea4\u4e92\u6d41\uff0c\u5373\u5224\u65ad\u5f53\u524dpair\u662f\u4e0d\u662f\u5339\u914d\uff08\u5c31\u662f\u4e2a\u5206\u7c7b\u4efb\u52a1\uff09\uff09\u3001LM\uff08\u751f\u6210\u5f0f\u4efb\u52a1\uff09\u4e09\u4e2a\u9884\u8bad\u7ec3\u4efb\u52a1\u8bad\u7ec3\u4e0d\u540c\u7684\u6a21\u5757\uff0c\u4e5f\u5bb9\u6613\u8fc1\u79fb\u5230\u5404\u79cd\u4e0b\u6e38\u4efb\u52a1\u4e2d\uff1b\u4e8c\u662f\u4e4b\u524d\u7684\u5f88\u591a\u5de5\u4f5c\u901a\u8fc7\u6269\u5145\u4e86\u7f51\u4e0a\u641c\u96c6\u7684\u56fe\u6587\u5bf9\u7684\u9884\u8bad\u7ec3\u6570\u636e\uff08GCC\u3001SBU\u3001CC12M\uff09\uff0c\u63d0\u9ad8\u4e86\u6a21\u578b\u6548\u679c\uff0c\u4f46\u5ffd\u7565\u4e86\u5176\u4e2d\u6709\u5f88\u591a\u4e0d\u5bf9\u9f50\u7684\u566a\u58f0\u60c5\u51b5\uff0c\u672c\u6587\u7528\u4e00\u4e2abootstrapping\u7684\u65b9\u6cd5\uff0c\u7528captioner\u4e3a\u7f51\u7edc\u56fe\u7247\u751f\u6210\u63cf\u8ff0\uff0c\u7528filter\u8fc7\u6ee4\u6389\u4e0d\u914d\u5bf9\u7684\u6570\u636e\uff0c\u4ece\u800c\u964d\u4f4e\u566a\u58f0\uff0c\u66f4\u9ad8\u6548\u5730\u5229\u7528\u7f51\u7edc\u4e0a\u7684\u6570\u636e\u3002<\/p>\n\n\n\n<p>\u5173\u952e\u7684\u6539\u8fdb\uff1a<\/p>\n\n\n\n<p>1. \u6a21\u578b\u7ed3\u6784\u4e0a\u6574\u5408\u4e86 ALBEF \u548c VLMo\u3002VLMo \u53c2\u6570\u5171\u4eab\uff0c\u4f46\u662f\u4e0d\u5b58\u5728\u5355\u72ec\u7f16\u7801\u5668\uff1bALBEF \u5b58\u5728\u5355\u72ec\u7f16\u7801\u5668\u4f46\u662f\u90e8\u5206\u53c2\u6570\u4e0d\u5171\u4eab\u3002\u8fd9\u7bc7\u8bba\u6587\u5b58\u5728\u5355\u72ec\u7684 vision encoder \u548c text encoder\u3002\u591a\u6a21\u6001\u7684\u53c2\u6570\u662f\u4ee5 cross-attention \u6a21\u5757\u63d2\u5165\u5230\u6587\u672c\u7f16\u7801\u5668\u5b9e\u73b0\u7684\uff0ccross-attention \u6a21\u5757\u4eab\u53d7\u6587\u672c\u7f16\u7801\u5668\u7684\u53c2\u6570\uff08\u53ef\u4ee5\u770b col 2 \u548c col3\uff09<\/p>\n\n\n\n<p>2. 
\u589e\u52a0\u4e86\u89e3\u7801\u5668\uff08\u53c2\u8003 col 4\uff09\uff0c\u4e3a\u4e86\u505a\u751f\u6210\u4efb\u52a1\u3002\u89e3\u7801\u5668\u62ff\u5230\u89c6\u89c9\u7279\u5f81\u548c\u672a\u63a9\u7801\u7684\u8bed\u8a00\u7279\u5f81\uff0c\u8fc7\u4e00\u4e2a causal self-attention \u5c42\uff0c\u505a GPT \u7528\u7684\u90a3\u79cd lm \u4efb\u52a1\u3002\u8fd9\u91cc\u533a\u522b\u4e8e MLM \u7684\u90a3\u79cd mask \u673a\u5236\uff0c\u662f\u901a\u8fc7 causal self-attention \u6765\u5b9e\u73b0\u56e0\u679c\u63a8\u7406\u7684\uff0c\u6211\u6b64\u65f6\u8fd8\u4e0d\u719f\u6089\u8fd9\u4e2a\u8fc7\u7a0b\u3002<\/p>\n\n\n\n<figure class=\"wp-block-image size-large\"><img loading=\"lazy\" width=\"1024\" height=\"580\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2023\/01\/image-23-1024x580.png\" alt=\"\" class=\"wp-image-11807\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2023\/01\/image-23-1024x580.png 1024w, http:\/\/139.9.1.231\/wp-content\/uploads\/2023\/01\/image-23-300x170.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2023\/01\/image-23-768x435.png 768w, http:\/\/139.9.1.231\/wp-content\/uploads\/2023\/01\/image-23.png 1148w\" sizes=\"(max-width: 1024px) 100vw, 1024px\" \/><\/figure>\n\n\n\n<p>3. 
\u9664\u4e86\u4e0a\u9762\u7684\u4e3b\u8981\u90e8\u5206\uff0c\u8fd8\u6709\u4e00\u4e2a\u91cd\u8981\u7684\u90e8\u5206\u662f\u5229\u7528\u8bad\u7ec3\u597d\u7684\u6a21\u578b\u751f\u6210\u4f2a\u6807\u7b7e\u3002\u5c06\u8bad\u7ec3\u597d\u7684\u6a21\u578b\u91cc\u7684\u4e0d\u540c\u7684\u90e8\u5206\u62ff\u51fa\u6765\u5728 COCO \u4e0a\u7a0d\u5fae\u5fae\u8c03\u4e00\u4e0b\uff0cdecoder \u90e8\u5206\u53ef\u4ee5\u751f\u6210\u6587\u672c\uff0c\u7b97 ITM loss \u7684\u90a3\u4e2a\u6a21\u5757\u53ef\u4ee5\u505a image-text pair \u7684\u8fc7\u6ee4\uff0c\u901a\u8fc7\u8f93\u51fa\u6253\u5206\u3001\u7f6e\u4fe1\u5ea6\u7684\u65b9\u5f0f\u3002\u5728\u5b9e\u9a8c\u4e2d\uff0cBLIP \u7684\u89e3\u7801\u80fd\u529b\u4f3c\u4e4e\u5f88\u5f3a\uff0c\u7528\u8fd9\u79cd\u8303\u5f0f\u751f\u6210\u7684\u6587\u672c\u4e0d\u4ec5\u4eba\u770b\u7740\u89c9\u5f97\u4e0d\u9519\uff0c\u7528\u4e8e\u81ea\u8bad\u7ec3\u540e\u4e5f\u53ef\u4ee5\u6da8\u70b9 2-3\uff0c\u975e\u5e38\u663e\u8457\u3002<\/p>\n\n\n\n<p>&nbsp;&nbsp;       \u4e00\u4e2a\u4f8b\u5b50\u662f stable diffusion \u7684\u5b98\u65b9\u535a\u6587\u91cc\u63d0\u5230\u4e86\uff0c\u4ed6\u4eec\u5728\u505a\u5fae\u8c03\u65f6\uff0c\u4f1a\u9047\u5230\u6570\u636e\u96c6\u53ea\u6709\u56fe\u7247\u6ca1\u6709 caption \u7684\u60c5\u51b5\uff0c\u6bd4\u5982 Pokemon \u6570\u636e\u3002\u4ed6\u4eec\u7528 BLIP \u6765\u505acaption\u751f\u6210\uff0c\u7136\u540e\u5fae\u8c03 stable diffusion \u53d1\u73b0\u6548\u679c\u5f88\u597d\u3002<\/p>\n\n\n\n<p>&nbsp;&nbsp;       \u53e6\u4e00\u4e2a\u4f8b\u5b50\u662f\u77e5\u540d\u7684\u5f00\u6e90\u591a\u6a21\u6001\u6570\u636e\u96c6 LAION\uff0c\u4ed6\u4eec\u4e5f\u7528\u4e86 BLIP \u6765\u8f85\u52a9\u5236\u4f5c\u6570\u636e\u96c6\u3002\u4ed6\u4eec\u7684\u8fc7\u7a0b\u5728\u5b98\u7f51\u516c\u5e03\u4e86\uff0c\u53ef\u4ee5\u53c2\u8003\u3002<\/p>\n\n\n\n<figure class=\"wp-block-image size-large\"><img loading=\"lazy\" width=\"1024\" height=\"425\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2023\/01\/image-24-1024x425.png\" alt=\"\" class=\"wp-image-11808\" 
srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2023\/01\/image-24-1024x425.png 1024w, http:\/\/139.9.1.231\/wp-content\/uploads\/2023\/01\/image-24-300x125.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2023\/01\/image-24-768x319.png 768w, http:\/\/139.9.1.231\/wp-content\/uploads\/2023\/01\/image-24.png 1120w\" sizes=\"(max-width: 1024px) 100vw, 1024px\" \/><\/figure>\n\n\n\n<p class=\"has-light-blue-background-color has-background\"><strong>\u603b\u7ed3\uff1a\u4e2a\u4eba\u611f\u89c9\u6a21\u578b\u90e8\u5206\u7684\u6539\u8fdb\u53ef\u80fd\u6709\u7528\u53ef\u80fd\u6ca1\u6709\u7528\uff0c\u4f46\u662f\u89e3\u7801\u5668\u8f93\u51fa\u7684 caption \u786e\u5b9e\u662f\u4e0d\u9519\u3002\u4ee5\u81f3\u4e8e\u5f88\u591a\u4e0b\u6e38\u4efb\u52a1\u90fd\u62ff BLIP \u6765\u751f\u6210 caption\u3002<\/strong><\/p>\n\n\n\n<h2>CoCa<\/h2>\n\n\n\n<p class=\"has-text-align-center\"><em>Contrastive Captioners are Image-Text Foundation Models<\/em><\/p>\n\n\n\n<p class=\"has-light-pink-background-color has-background\"><strong>\u4ee3\u7801:<\/strong> <a href=\"https:\/\/github.com\/lucidrains\/CoCa-pytorch\" target=\"_blank\" rel=\"noreferrer noopener\"><em>https:\/\/github.com\/lucidrains\/CoCa-pytorch<\/em><\/a><\/p>\n\n\n\n<p>\u5b83\u4e5f\u662f ALBEF \u7684\u540e\u7eed\u5de5\u4f5c\uff0c\u6a21\u578b\u975e\u5e38\u50cf\u3002\u533a\u522b\u5728\u4e8e\uff1a<\/p>\n\n\n\n<p>1. \u56fe\u50cf\u7528\u4e86 attentional pooling\uff0c\u8fd9\u5728\u672c\u6587\u7684\u5b9e\u9a8c\u4e2d\u6709\u6548<\/p>\n\n\n\n<p>2. 
\u53bb\u6389\u4e86 ITM loss\uff0c\u76ee\u7684\u662f\u52a0\u5feb\u8bad\u7ec3\uff0c\u539f\u672c\u6587\u672c\u9700\u8981 forward 2-3 \u6b21\uff0c\u53bb\u6389 ITM loss \u4e4b\u540e\u53ea\u9700\u8981 forward \u4e00\u6b21\u5c31\u53ef\u4ee5\u4e86\u3002\u5728 ALBEF \u4e2d\uff0cITM \u9700\u8981\u5b8c\u6574\u7684 text\uff0c\u800c MLM \u9700\u8981\u63a9\u7801\uff0c\u6240\u4ee5\u662f\u4e24\u6b21\u8f93\u5165\u3002\u5728 BLIP \u4e2d\uff0cITC \u4e00\u6b21\uff0cITM \u56e0\u4e3a\u5728\u6587\u672c\u6a21\u578b\u4e2d\u63d2\u5165\u4e86\u65b0\u7684\u6a21\u5757\uff0c\u6240\u4ee5\u5f97\u5355\u72ec\u505a\u524d\u5411\u3002\u800c LM \u56e0\u4e3a\u7528\u4e86\u65e2\u591a\u4e86\u65b0\u7684\u6a21\u5757\u53c8\u5f97\u7528 causal self-attention \u6240\u4ee5\u53c8\u5f97\u5355\u72ec\u505a\u4e00\u6b21\u3002\u5728 CoCa \u4e2d\uff0c\u4e3a\u4e86\u5b8c\u6210 captioning loss \u548c ITC loss\uff0c\u53ea\u9700\u8981\u505a\u4e00\u6b21\u524d\u5411\u5373\u53ef\u3002GPT \u4e2d\u628a cls-token \u653e\u5728\u6700\u540e\u9762\u5c31\u53ef\u4ee5\u5f97\u5230\u5168\u5c40\u8868\u5f81\u6765\u505a ITC loss \u4e86\u3002<\/p>\n\n\n\n<p>\u7b80\u5355\u5feb\u901f\u7684\u65b9\u6cd5\u53ef\u4ee5\u6709\u6548\u5730 scale\uff0c\u800c\u6211\u4eec\u77e5\u9053\u590d\u6742\u7684\u6a21\u578b\u8bbe\u8ba1\u3001loss \u8bbe\u8ba1\u7ecf\u5e38\u4e0d\u5982\u7b80\u5355\u5730\u653e\u5927\u6a21\u578b\u3001\u589e\u52a0\u6570\u636e\u6709\u6548\u3002\u53c2\u8003\u51ef\u660e\u7684 FLYP\u3002<\/p>\n\n\n\n<figure class=\"wp-block-image size-large\"><img loading=\"lazy\" width=\"1024\" height=\"466\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2023\/01\/image-25-1024x466.png\" alt=\"\" class=\"wp-image-11815\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2023\/01\/image-25-1024x466.png 1024w, http:\/\/139.9.1.231\/wp-content\/uploads\/2023\/01\/image-25-300x136.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2023\/01\/image-25-768x349.png 768w, http:\/\/139.9.1.231\/wp-content\/uploads\/2023\/01\/image-25.png 1471w\" sizes=\"(max-width: 
1024px) 100vw, 1024px\" \/><\/figure>\n\n\n\n<figure class=\"wp-block-image size-large\"><img loading=\"lazy\" width=\"1024\" height=\"474\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2023\/01\/image-27-1024x474.png\" alt=\"\" class=\"wp-image-11818\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2023\/01\/image-27-1024x474.png 1024w, http:\/\/139.9.1.231\/wp-content\/uploads\/2023\/01\/image-27-300x139.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2023\/01\/image-27-768x355.png 768w, http:\/\/139.9.1.231\/wp-content\/uploads\/2023\/01\/image-27.png 1070w\" sizes=\"(max-width: 1024px) 100vw, 1024px\" \/><\/figure>\n\n\n\n<p>\u8fd9\u79cd\u753b\u56fe\u7684\u65b9\u5f0f\u5f88\u4e0d\u9519\uff0c\u5f88\u76f4\u89c2\u3002\u53ef\u4ee5\u53c2\u8003\uff0c\u4ee5\u540e\u4e5f\u753b\u6210\u8fd9\u6837\u3002<\/p>\n\n\n\n<figure class=\"wp-block-image size-large\"><img loading=\"lazy\" width=\"1024\" height=\"785\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2023\/01\/image-29-1024x785.png\" alt=\"\" class=\"wp-image-11820\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2023\/01\/image-29-1024x785.png 1024w, http:\/\/139.9.1.231\/wp-content\/uploads\/2023\/01\/image-29-300x230.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2023\/01\/image-29-768x589.png 768w, http:\/\/139.9.1.231\/wp-content\/uploads\/2023\/01\/image-29.png 1083w\" sizes=\"(max-width: 1024px) 100vw, 1024px\" \/><\/figure>\n\n\n\n<p class=\"has-light-blue-background-color has-background\"><strong>\u603b\u7ed3\uff1a<\/strong><\/p>\n\n\n\n<p>\u7b80\u5355\u6709\u6548\u7684\u7ed3\u6784\u8bbe\u8ba1\uff0c\u6211\u5bf9 CoCa \u7684\u5370\u8c61\u662f\u7b80\u5355\u6709\u6548\u3002\u5b83\u7684\u5cf0\u503c\u6027\u80fd\u6211\u6ca1\u6709\u611f\u89c9\u5f88\u70b8\u88c2\uff0c\u53ef\u80fd\u662f\u6a21\u578b\u3001\u6570\u636e scale \u4e4b\u540e\u81ea\u7136\u7684\u7ed3\u679c\u3002\u4f46\u662f\u5b83\u7684 zero-shot \u6027\u80fd\u8ba9\u6211\u5370\u8c61\u5f88\u6df1\u523b\uff0c\u5728 imagenet 
\u4e0a\u5fae\u8c03\u4e0d\u5fae\u8c03\u7684\u5dee\u8ddd\u5f88\u5c0f\uff0c\u8fd9\u4e00\u70b9\u975e\u5e38\u975e\u5e38\u5173\u952e\u3002<\/p>\n\n\n\n<p>\u8bfb\u5230 coca\uff0c\u6211\u5bf9\u591a\u6a21\u6001\u7684\u7591\u95ee\u8fd8\u6709\u4e24\u70b9\uff1a<\/p>\n\n\n\n<p>1. mixture of experts \u7684\u7ed3\u6784\u6ca1\u6709\u5728\u672c\u6587\u4e2d\u5f97\u5230\u5e94\u7528\uff0c\u4f46\u6211\u611f\u89c9\u662f\u4e2a\u76f8\u5f53\u6709\u524d\u9014\u7684\u7ed3\u6784<\/p>\n\n\n\n<p>2. \u53cc\u5411\u7684\u751f\u6210 loss \u8fd8\u662f\u6ca1\u4eba\u505a\uff0c\u8c01\u8bf4\u53ea\u80fd\u56fe\u50cf\u8f85\u52a9\u6587\u672c?<\/p>\n\n\n\n<h2>BeiTv<\/h2>\n\n\n\n<p class=\"has-text-align-center\"><em>(BEiT-3) Image as a Foreign Language: BEiT Pretraining for All Vision and Vision-Language Tasks<\/em><\/p>\n\n\n\n<p>       \u8bba\u6587\u7684\u5356\u70b9\u662f\u5927\u4e00\u7edf\u3002\u5728 introduction \u7ae0\u8282\u8be6\u7ec6\u4ecb\u7ecd\u4e86\u5927\u4e00\u7edf\u6307\u7684\u662f\u7edf\u4e00\u6a21\u578b\u3001loss \u548c\u6570\u636e\u3002\u6211\u89c9\u5f97\u53ef\u4ee5\u7b80\u5355\u5730\u6982\u62ec\u4e3a\uff1a\u7528\u7edf\u4e00\u7684 multi-way transformer (mixture of experts ) \u67b6\u6784\u548c\u5355\u4e2a masked modeling loss\uff0c\u5c06\u4efb\u610f\u6a21\u6001\u770b\u505a\u662f\u540c\u4e00\u4e2a\u6a21\u6001\u6765\u5efa\u6a21\u3002<\/p>\n\n\n\n<figure class=\"wp-block-image size-full is-resized\"><img loading=\"lazy\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2023\/01\/image-31.png\" alt=\"\" class=\"wp-image-11829\" width=\"690\" height=\"299\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2023\/01\/image-31.png 922w, http:\/\/139.9.1.231\/wp-content\/uploads\/2023\/01\/image-31-300x130.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2023\/01\/image-31-768x333.png 768w\" sizes=\"(max-width: 690px) 100vw, 690px\" \/><\/figure>\n\n\n\n<p>       
\u5177\u4f53\u800c\u8a00\uff0c\u5b83\u6307\u7684\u662f\u5728\u5c06\u4efb\u610f\u6a21\u6001\u8f93\u5165\u7f51\u7edc\u540e\uff0c\u90fd\u8868\u73b0\u4e3a list of tokens\uff0c\u76f4\u63a5\u5c06\u5b83\u4eec\u770b\u505a\u662f\u76f8\u540c\u7684\u6a21\u6001\u6765\u505a masked modeling \u5c31\u597d\u4e86\u3002\u5982\u679c\u60f3\u8981\u62ff\u8fc7\u53bb\u505a\u4e0b\u6e38\u4efb\u52a1\u7684\u8bdd\uff0c\u76f4\u63a5\u5c06\u9700\u8981\u7684\u90a3\u90e8\u5206\u6a21\u578b\u62ff\u51fa\u6765\u5373\u53ef\u3002\u6bd4\u5982\u505a\u89c6\u89c9\u4efb\u52a1\u5c31\u62ff\u89c6\u89c9\u6a21\u578b\uff0c\u505a\u8bed\u8a00\u4efb\u52a1\u5c31\u62ff\u8bed\u8a00\u6a21\u578b\u3002\u5982\u679c\u662f\u505a\u591a\u6a21\u6001\u4efb\u52a1\uff0c\u53ef\u4ee5\u7075\u6d3b\u5730\u6a21\u62df\u4e0d\u540c\u7684\u9700\u6c42\uff0c\u6bd4\u5982\uff1a1. \u505a\u751f\u6210\u4efb\u52a1\u53ef\u4ee5\u62ff\u591a\u6a21\u6001\u90e8\u5206\u7684\u53c2\u6570\u51fa\u6765 2. \u505a\u56fe\u6587\u68c0\u7d22\u53ef\u4ee5\u5355\u72ec\u53d6\u51fa\u89c6\u89c9\u90e8\u5206\u548c\u8bed\u8a00\u90e8\u5206\u6765\u6a21\u62df CLIP\u3002\u4e0d\u4ec5\u4ec5\u662f\u80fd\u505a\u4efb\u610f\u4efb\u52a1\uff0c\u8fd8\u7ee7\u627f\u4e86\u524d\u4f5c\u7684\u4f18\u70b9\uff0c\u6bd4\u5982 CLIP \u8fd9\u79cd\u5f31\u8de8\u6a21\u6001\u4ea4\u4e92\u5e26\u6765\u7684\u8ba1\u7b97\u6548\u7387\u7684\u4f18\u52bf\u3002<\/p>\n\n\n\n<figure class=\"wp-block-image size-full\"><img loading=\"lazy\" width=\"944\" height=\"593\" src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2023\/01\/image-30.png\" alt=\"\" class=\"wp-image-11822\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2023\/01\/image-30.png 944w, http:\/\/139.9.1.231\/wp-content\/uploads\/2023\/01\/image-30-300x188.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2023\/01\/image-30-768x482.png 768w\" sizes=\"(max-width: 944px) 100vw, 944px\" \/><\/figure>\n\n\n\n<p>\u603b\u7ed3:<\/p>\n\n\n\n<figure class=\"wp-block-image size-large\"><img loading=\"lazy\" width=\"1024\" height=\"469\" 
src=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2023\/01\/image-32-1024x469.png\" alt=\"\" class=\"wp-image-11833\" srcset=\"http:\/\/139.9.1.231\/wp-content\/uploads\/2023\/01\/image-32-1024x469.png 1024w, http:\/\/139.9.1.231\/wp-content\/uploads\/2023\/01\/image-32-300x137.png 300w, http:\/\/139.9.1.231\/wp-content\/uploads\/2023\/01\/image-32-768x352.png 768w, http:\/\/139.9.1.231\/wp-content\/uploads\/2023\/01\/image-32-1536x703.png 1536w, http:\/\/139.9.1.231\/wp-content\/uploads\/2023\/01\/image-32-2048x938.png 2048w\" sizes=\"(max-width: 1024px) 100vw, 1024px\" \/><\/figure>\n\n\n\n<p><\/p>\n","protected":false},"excerpt":{"rendered":"<p>BLIP BLIP: Bootstrapping Language-Image Pre-training fo &hellip; <a href=\"http:\/\/139.9.1.231\/index.php\/2023\/01\/20\/blip-coca-and-beitv\/\" class=\"more-link\">\u7ee7\u7eed\u9605\u8bfb<span class=\"screen-reader-text\">\u591a\u6a21\u6001|BLIP \u3001CoCa and  BeiTv<\/span><\/a><\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":[],"categories":[21,4,9],"tags":[],"_links":{"self":[{"href":"http:\/\/139.9.1.231\/index.php\/wp-json\/wp\/v2\/posts\/11785"}],"collection":[{"href":"http:\/\/139.9.1.231\/index.php\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"http:\/\/139.9.1.231\/index.php\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"http:\/\/139.9.1.231\/index.php\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"http:\/\/139.9.1.231\/index.php\/wp-json\/wp\/v2\/comments?post=11785"}],"version-history":[{"count":37,"href":"http:\/\/139.9.1.231\/index.php\/wp-json\/wp\/v2\/posts\/11785\/revisions"}],"predecessor-version":[{"id":11836,"href":"http:\/\/139.9.1.231\/index.php\/wp-json\/wp\/v2\/posts\/11785\/revisions\/11836"}],"wp:attachment":[{"href":"http:\/\/139.9.1.231\/index.php\/wp-json\/wp\/v2\/media?parent=11785"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":
"http:\/\/139.9.1.231\/index.php\/wp-json\/wp\/v2\/categories?post=11785"},{"taxonomy":"post_tag","embeddable":true,"href":"http:\/\/139.9.1.231\/index.php\/wp-json\/wp\/v2\/tags?post=11785"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}