{"id":3668,"date":"2022-04-09T15:59:02","date_gmt":"2022-04-09T07:59:02","guid":{"rendered":"http:\/\/139.9.1.231\/?p=3668"},"modified":"2022-04-09T15:59:04","modified_gmt":"2022-04-09T07:59:04","slug":"attentionmulti-head-attentio","status":"publish","type":"post","link":"http:\/\/139.9.1.231\/index.php\/2022\/04\/09\/attentionmulti-head-attentio\/","title":{"rendered":"Self Attention\u548cMulti-Head Attention\u7684\u539f\u7406"},"content":{"rendered":"\n<p><strong>Self Attention\u539f\u7406<\/strong><\/p>\n\n\n\n<p>&nbsp;&nbsp;&nbsp;&nbsp;self attention\u6709\u4ec0\u4e48\u4f18\u70b9\u5462\uff0c\u8fd9\u91cc\u5f15\u7528\u8c37\u6b4c\u8bba\u6587\u300aAttention Is All You Need\u300b\u91cc\u9762\u8bf4\u7684\uff0c\u7b2c\u4e00\u662f\u8ba1\u7b97\u590d\u6742\u5ea6\u5c0f\uff0c\u7b2c\u4e8c\u662f\u53ef\u4ee5\u5927\u91cf\u7684\u5e76\u884c\u8ba1\u7b97\uff0c\u7b2c\u4e09\u662f\u53ef\u4ee5\u66f4\u597d\u7684\u5b66\u4e60\u8fdc\u8ddd\u79bb\u4f9d\u8d56\u3002Attention\u7684\u8ba1\u7b97\u516c\u5f0f\u5982\u4e0b\uff1a<\/p>\n\n\n\n<figure class=\"wp-block-image\"><img src=\"https:\/\/imgconvert.csdnimg.cn\/aHR0cDovL3d3dy5jaGVuamlhbnF1LmNvbS9tZWRpYS91cGltZy8wXzIwMTkwOTIxMjIxNTUzXzU1Mi5wbmc?x-oss-process=image\/format,png\" alt=\"0.png\"\/><\/figure>\n\n\n\n<p>&nbsp; &nbsp; \u4e0b\u9762\u4e00\u6b65\u6b65\u5206\u89e3self attention\u7684\u8ba1\u7b97\u8fc7\u7a0b\uff08\u56fe\u6765\u81ea<a href=\"https:\/\/jalammar.github.io\/illustrated-transformer\/\" target=\"_blank\" rel=\"noreferrer noopener\">https:\/\/jalammar.github.io\/illustrated-transformer\/<\/a>\uff09\uff1a<\/p>\n\n\n\n<ol><li>\u8f93\u5165\u5355\u8bcd\u8868\u793a\u5411\u91cf\uff0c\u6bd4\u5982\u53ef\u4ee5\u662f<a href=\"https:\/\/so.csdn.net\/so\/search?q=%E8%AF%8D%E5%90%91%E9%87%8F&amp;spm=1001.2101.3001.7020\" target=\"_blank\" rel=\"noreferrer noopener\">\u8bcd\u5411\u91cf<\/a>\u3002<\/li><li>\u628a\u8f93\u5165\u5411\u91cf\u6620\u5c04\u5230q\u3001k\u3001v\u4e09\u4e2a\u53d8\u91cf\uff0c\u5982\u4e0b\u56fe\uff1a<img loading=\"lazy\" 
alt=\"1.png\" height=\"349\" src=\"https:\/\/imgconvert.csdnimg.cn\/aHR0cDovL3d3dy5jaGVuamlhbnF1LmNvbS9tZWRpYS91cGltZy8xXzIwMTkwOTIxMjIwNTAyXzM3Mi5wbmc?x-oss-process=image\/format,png\" width=\"566\">\u6bd4\u5982\u4e0a\u56feX1\u548cX2\u5206\u522b\u662fThinking\u548cMachines\u8fd9\u4e24\u4e2a\u5355\u8bcd\u7684\u8bcd\u5411\u91cf\uff0cq1\u548cq2\u88ab\u79f0\u4e3a\u67e5\u8be2\u5411\u91cf\uff0ck\u79f0\u4e3a\u952e\u5411\u91cf\uff0cv\u79f0\u4e3a\u503c\u5411\u91cf\u3002Wq\uff0cWk\uff0cWv\u90fd\u662f\u968f\u673a\u521d\u59cb\u5316\u7684\u6620\u5c04\u77e9\u9635\u3002<\/li><li>\u8ba1\u7b97Attention score\uff0c\u5373\u67d0\u4e2a\u5355\u8bcd\u7684\u67e5\u8be2\u5411\u91cf\u548c\u5404\u4e2a\u5355\u8bcd\u5bf9\u5e94\u7684\u952e\u5411\u91cf\u7684\u5339\u914d\u5ea6\uff0c\u5339\u914d\u5ea6\u53ef\u4ee5\u901a\u8fc7\u52a0\u6cd5\u6216\u70b9\u79ef\u5f97\u5230\u3002\u56fe\u5982\u4e0b\uff1a<img loading=\"lazy\" alt=\"2.png\" height=\"293\" src=\"https:\/\/imgconvert.csdnimg.cn\/aHR0cDovL3d3dy5jaGVuamlhbnF1LmNvbS9tZWRpYS91cGltZy8yXzIwMTkwOTIxMjIxMzIyXzk2NS5wbmc?x-oss-process=image\/format,png\" width=\"553\"><\/li><li>\u51cf\u5c0fscore\uff0c\u5e76\u5c06score\u8f6c\u6362\u4e3a\u6743\u91cd\u3002<img loading=\"lazy\" alt=\"3.png\" height=\"351\" src=\"https:\/\/imgconvert.csdnimg.cn\/aHR0cDovL3d3dy5jaGVuamlhbnF1LmNvbS9tZWRpYS91cGltZy8zXzIwMTkwOTIxMjIxMjQwXzU2NC5wbmc?x-oss-process=image\/format,png\" width=\"554\">\u5176\u4e2ddk\u662fq k v\u7684\u7ef4\u5ea6\u3002score\u53ef\u4ee5\u901a\u8fc7\u70b9\u79ef\u548c\u52a0\u6cd5\u5f97\u5230\uff0c\u5f53dk\u8f83\u5c0f\u65f6\uff0c\u8fd9\u4e24\u79cd\u65b9\u6cd5\u5f97\u5230\u7684\u7ed3\u679c\u5f88\u76f8\u4f3c\u3002\u4f46\u662f\u70b9\u79ef\u7684\u901f\u5ea6\u66f4\u5feb\u548c\u7701\u7a7a\u95f4\u3002\u4f46\u662f\u5f53dk\u8f83\u5927\u65f6\uff0c\u52a0\u6cd5\u8ba1\u7b97score\u4f18\u4e8e\u70b9\u79ef\u7ed3\u679c\u6ca1\u6709\u9664\u4ee5dk^0.5\u7684\u60c5\u51b5\u3002\u539f\u56e0\u53ef\u80fd\u662f\uff1athe dot products grow large in magnitude, pushing the softmax 
function into regions where it has extremely small gradients\u3002\u6240\u4ee5\u8981\u5148\u9664\u4ee5dk^0.5\uff0c\u518d\u8fdb\u884csoftmax\u3002<\/li><li>\u6743\u91cd\u4e58\u4ee5v\uff0c\u5e76\u6c42\u548c\u3002<img loading=\"lazy\" alt=\"4.png\" height=\"523\" src=\"https:\/\/imgconvert.csdnimg.cn\/aHR0cDovL3d3dy5jaGVuamlhbnF1LmNvbS9tZWRpYS91cGltZy80XzIwMTkwOTIxMjIyNTU3XzM1NS5wbmc?x-oss-process=image\/format,png\" width=\"621\">\u6700\u7ec8\u7684\u7ed3\u679cz\u5c31\u662fx1\u8fd9\u4e2a\u5355\u8bcd\u7684Attention\u5411\u91cf\u3002\u5f53\u540c\u65f6\u8ba1\u7b97\u6240\u6709\u5355\u8bcd\u7684Attention\u65f6\uff0c\u56fe\u793a\u5982\u4e0b\uff1a1.&nbsp;\u5c06\u8f93\u5165\u8bcd\u5411\u91cf\u8f6c\u6362\u4e3aQ\u3001K\u3001V.<img loading=\"lazy\" alt=\"5.png\" height=\"456\" src=\"https:\/\/imgconvert.csdnimg.cn\/aHR0cDovL3d3dy5jaGVuamlhbnF1LmNvbS9tZWRpYS91cGltZy81XzIwMTkwOTIxMjIyOTA5XzM5OS5wbmc?x-oss-process=image\/format,png\" width=\"406\">2.&nbsp;\u76f4\u63a5\u8ba1\u7b97Z<img loading=\"lazy\" alt=\"6.png\" height=\"216\" src=\"https:\/\/imgconvert.csdnimg.cn\/aHR0cDovL3d3dy5jaGVuamlhbnF1LmNvbS9tZWRpYS91cGltZy82XzIwMTkwOTIxMjIyOTQwXzM1Ny5wbmc?x-oss-process=image\/format,png\" width=\"513\">&nbsp;<\/li><\/ol>\n\n\n\n<p><strong>Self Attention\u4ee3\u7801\u5b9e\u73b0<\/strong><\/p>\n\n\n\n<p>&nbsp;&nbsp;&nbsp;&nbsp;\u4f7f\u7528Keras\u81ea\u5b9a\u4e49self attention\u5c42\uff0c\u4ee3\u7801\u5982\u4e0b\uff1a<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>from keras import initializersfrom keras import activationsfrom keras import backend as Kfrom keras.engine.topology import Layer class MySelfAttention(Layer):        def __init__(self,output_dim,kernel_initializer='glorot_uniform',**kwargs):        self.output_dim=output_dim        self.kernel_initializer = initializers.get(kernel_initializer)        super(MySelfAttention,self).__init__(**kwargs)            def build(self,input_shape):        self.W=self.add_weight(name='W',             
shape=(3,input_shape&#91;2],self.output_dim),             initializer=self.kernel_initializer,             trainable=True)        self.built = True            def call(self,x):        q=K.dot(x,self.W&#91;0])        k=K.dot(x,self.W&#91;1])        v=K.dot(x,self.W&#91;2])        <em>#print('q_shape:'+str(q.shape))<\/em>        e=K.batch_dot(q,K.permute_dimensions(k,&#91;0,2,1]))<em>#\u628ak\u8f6c\u7f6e\uff0c\u5e76\u4e0eq\u70b9\u4e58<\/em>        e=e\/(self.output_dim**0.5)        e=K.softmax(e)        o=K.batch_dot(e,v)        return o            def compute_output_shape(self,input_shape):        return (input_shape&#91;0],input_shape&#91;1],self.output_dim)<\/code><\/pre>\n\n\n\n<p><strong>Multi-Head Attention\u539f\u7406<\/strong><\/p>\n\n\n\n<p>&nbsp;&nbsp;&nbsp;&nbsp;\u4e0d\u540c\u7684\u968f\u673a\u521d\u59cb\u5316\u6620\u5c04\u77e9\u9635Wq,Wk,Wv\u53ef\u4ee5\u5c06\u8f93\u5165\u5411\u91cf\u6620\u5c04\u5230\u4e0d\u540c\u7684\u5b50\u7a7a\u95f4\uff0c\u8fd9\u53ef\u4ee5\u8ba9\u6a21\u578b\u4ece\u4e0d\u540c\u89d2\u5ea6\u7406\u89e3\u8f93\u5165\u7684\u5e8f\u5217\u3002\u56e0\u6b64\u540c\u65f6\u51e0\u4e2aAttention\u7684\u7ec4\u5408\u6548\u679c\u53ef\u80fd\u4f1a\u4f18\u4e8e\u5355\u4e2aAttention\uff0c\u8fd9\u79cd\u540c\u65f6\u8ba1\u7b97\u591a\u4e2aAttention\u7684\u65b9\u6cd5\u88ab\u79f0\u4e3aMulti-Head Attention\uff0c\u6216\u8005\u591a\u5934\u6ce8\u610f\u529b\u3002<\/p>\n\n\n\n<p>&nbsp;&nbsp;&nbsp;&nbsp;\u6bcf\u4e2a\u201cHead\u201d\u90fd\u4f1a\u4ea7\u751f\u4e00\u4e2a\u8f93\u51fa\u5411\u91cfz\uff0c\u4f46\u662f\u6211\u4eec\u4e00\u822c\u53ea\u9700\u8981\u4e00\u4e2a\uff0c\u56e0\u6b64\u8fd8\u9700\u8981\u4e00\u4e2a\u77e9\u9635\u628a\u591a\u4e2a\u5408\u5e76\u7684\u6ce8\u610f\u529b\u5411\u91cf\u6620\u5c04\u4e3a\u5355\u4e2a\u5411\u91cf\u3002\u56fe\u793a\u5982\u4e0b\uff1a<\/p>\n\n\n\n<figure class=\"wp-block-image\"><img src=\"https:\/\/imgconvert.csdnimg.cn\/aHR0cDovL3d3dy5jaGVuamlhbnF1LmNvbS9tZWRpYS91cGltZy83XzIwMTkwOTIxMjI0ODU2XzY0NC5wbmc?x-oss-process=image\/format,png\" 
alt=\"7.png\"\/><\/figure>\n\n\n\n<p><strong>Multi-Head Attention\u4ee3\u7801\u5b9e\u73b0<\/strong><\/p>\n\n\n\n<p>&nbsp;&nbsp;&nbsp;&nbsp;\u8fd8\u662f\u4f7f\u7528Keras\u5b9e\u73b0multi-head attention\uff0c\u4ee3\u7801\u5982\u4e0b\uff1a<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>from keras import initializersfrom keras import activationsfrom keras import backend as Kfrom keras.engine.topology import Layer  class MyMultiHeadAttention(Layer):    def __init__(self,output_dim,num_head,kernel_initializer='glorot_uniform',**kwargs):        self.output_dim=output_dim        self.num_head=num_head        self.kernel_initializer = initializers.get(kernel_initializer)        super(MyMultiHeadAttention,self).__init__(**kwargs)            def build(self,input_shape):        self.W=self.add_weight(name='W',           shape=(self.num_head,3,input_shape&#91;2],self.output_dim),           initializer=self.kernel_initializer,           trainable=True)        self.Wo=self.add_weight(name='Wo',           shape=(self.num_head*self.output_dim,self.output_dim),           initializer=self.kernel_initializer,           trainable=True)        self.built = True            def call(self,x):        q=K.dot(x,self.W&#91;0,0])        k=K.dot(x,self.W&#91;0,1])        v=K.dot(x,self.W&#91;0,2])        e=K.batch_dot(q,K.permute_dimensions(k,&#91;0,2,1]))<em>#\u628ak\u8f6c\u7f6e\uff0c\u5e76\u4e0eq\u70b9\u4e58<\/em>        e=e\/(self.output_dim**0.5)        e=K.softmax(e)        outputs=K.batch_dot(e,v)        for i in range(1,self.W.shape&#91;0]):            q=K.dot(x,self.W&#91;i,0])            k=K.dot(x,self.W&#91;i,1])            v=K.dot(x,self.W&#91;i,2])            <em>#print('q_shape:'+str(q.shape))<\/em>            e=K.batch_dot(q,K.permute_dimensions(k,&#91;0,2,1]))<em>#\u628ak\u8f6c\u7f6e\uff0c\u5e76\u4e0eq\u70b9\u4e58<\/em>            e=e\/(self.output_dim**0.5)            e=K.softmax(e)            <em>#print('e_shape:'+str(e.shape))<\/em>            o=K.batch_dot(e,v)            
outputs=K.concatenate(&#91;outputs,o])        z=K.dot(outputs,self.Wo)        return z            def compute_output_shape(self,input_shape):        return (input_shape&#91;0],input_shape&#91;1],self.output_dim)<\/code><\/pre>\n","protected":false},"excerpt":{"rendered":"<p>Self Attention\u539f\u7406 &nbsp;&nbsp;&nbsp;&nbsp;self attention &hellip; <a href=\"http:\/\/139.9.1.231\/index.php\/2022\/04\/09\/attentionmulti-head-attentio\/\" class=\"more-link\">\u7ee7\u7eed\u9605\u8bfb<span class=\"screen-reader-text\">Self Attention\u548cMulti-Head Attention\u7684\u539f\u7406<\/span><\/a><\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":[],"categories":[4],"tags":[],"_links":{"self":[{"href":"http:\/\/139.9.1.231\/index.php\/wp-json\/wp\/v2\/posts\/3668"}],"collection":[{"href":"http:\/\/139.9.1.231\/index.php\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"http:\/\/139.9.1.231\/index.php\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"http:\/\/139.9.1.231\/index.php\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"http:\/\/139.9.1.231\/index.php\/wp-json\/wp\/v2\/comments?post=3668"}],"version-history":[{"count":1,"href":"http:\/\/139.9.1.231\/index.php\/wp-json\/wp\/v2\/posts\/3668\/revisions"}],"predecessor-version":[{"id":3669,"href":"http:\/\/139.9.1.231\/index.php\/wp-json\/wp\/v2\/posts\/3668\/revisions\/3669"}],"wp:attachment":[{"href":"http:\/\/139.9.1.231\/index.php\/wp-json\/wp\/v2\/media?parent=3668"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"http:\/\/139.9.1.231\/index.php\/wp-json\/wp\/v2\/categories?post=3668"},{"taxonomy":"post_tag","embeddable":true,"href":"http:\/\/139.9.1.231\/index.php\/wp-json\/wp\/v2\/tags?post=3668"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}