{"id":6204,"date":"2022-08-24T10:49:25","date_gmt":"2022-08-24T02:49:25","guid":{"rendered":"http:\/\/139.9.1.231\/?p=6204"},"modified":"2022-08-30T20:38:38","modified_gmt":"2022-08-30T12:38:38","slug":"textclear","status":"publish","type":"post","link":"http:\/\/139.9.1.231\/index.php\/2022\/08\/24\/textclear\/","title":{"rendered":"\u4e2d\u6587\u6587\u672c\u6e05\u6d17\u4e0e\u7279\u5f81\u63d0\u53d6"},"content":{"rendered":"\n<p>\u6458\u81ea\u77e5\u4e4e\uff1a<\/p>\n\n\n\n<p><a rel=\"noreferrer noopener\" href=\"https:\/\/www.zhihu.com\/people\/xiao-ma-ge-22-58\" target=\"_blank\">bookname<\/a>\u5d4c\u5165\u5f0fAI\u7b97\u6cd5\u7814\u7a76<\/p>\n\n\n\n<h2>\u4e2d\u6587\u6587\u672c\u6e05\u6d17<\/h2>\n\n\n\n<p>\u4e2d\u6587\u6587\u672c\u6e05\u6d17\uff1a <\/p>\n\n\n\n<p>&#8211; \u53bb\u9664\u6307\u5b9a\u65e0\u7528\u7684\u7b26\u53f7<\/p>\n\n\n\n<p> &#8211; \u8ba9\u6587\u672c\u53ea\u4fdd\u7559\u6c49\u5b57<\/p>\n\n\n\n<p> &#8211; \u6587\u672c\u4e2d\u7684\u8868\u60c5\u7b26\u53f7\u53bb\u9664 <\/p>\n\n\n\n<p>&#8211; \u7e41\u4f53\u4e2d\u6587\u4e0e\u7b80\u4f53\u4e2d\u6587\u8f6c\u6362<\/p>\n\n\n\n<h3>\u4e2d\u6587\u6587\u672c\u6e05\u6d17\u7c7b<\/h3>\n\n\n\n<pre class=\"wp-block-code\"><code>import re\nfrom opencc import OpenCC\nfrom bs4 import BeautifulSoup\nimport jieba\nfrom glob import glob\n\nimport torch\nfrom tqdm.auto import tqdm\n\nimport sys\n!ls ..\/package\/\nsys.path.insert(0, \"..\/package\/\")\nfrom ltp import LTP\nnlp = LTP(path=\"base\")\n\nclass TextCleaner:\n    '''\n        \u6279\u91cf\u6e05\u6d17\u6570\u636e\n    '''\n    def __init__(self,\n                 remove_space=True, <em># \u53bb\u9664\u7a7a\u683c<\/em>\n                 remove_suspension=True, <em># \u8f6c\u6362\u7701\u7565\u53f7<\/em>\n                 only_zh=False, <em># \u53ea\u4fdd\u7559\u6c49\u5b57<\/em>\n                 remove_sentiment_character=True, <em># \u53bb\u9664\u8868\u60c5\u7b26\u53f7<\/em>\n                 to_simple=True, <em># \u8f6c\u5316\u4e3a\u7b80\u4f53\u4e2d\u6587<\/em>\n     
            remove_html_label=True,\n                 remove_stop_words=False,\n                 stop_words_dir=\".\/\u505c\u7528\u8bcd\/\",\n                 with_space=False,\n                 batch_size=256):\n        self._remove_space = remove_space\n        self._remove_suspension = remove_suspension\n        self._remove_sentiment_character = remove_sentiment_character\n\n        self._only_zh = only_zh\n        self._to_simple = to_simple\n\n        self._remove_html_label = remove_html_label\n        self._remove_stop_words = remove_stop_words\n        self._stop_words_dir = stop_words_dir\n\n        self._with_space = with_space\n        self._batch_size = batch_size\n\n    def clean_single_text(self, text):\n        if self._remove_space:\n            text = self.remove_space(text)\n        if self._remove_suspension:\n            text = self.remove_suspension(text)\n        if self._remove_sentiment_character:\n            text = self.remove_sentiment_character(text)\n        if self._to_simple:\n            text = self.to_simple(text)\n        if self._only_zh:\n            text = self.get_zh_only(text)\n        if self._remove_html_label:\n            text = self.remove_html(text)\n        return text\n\n    def clean_text(self, text_list):\n        text_list = &#91;self.clean_single_text(text) for text in tqdm(text_list)]\n        tokenized_words_list = self.tokenizer_batch_text(text_list)\n        if self._remove_stop_words:\n            text_list = &#91;self.remove_stop_words(words_list, self._stop_words_dir, self._with_space) for words_list in tokenized_words_list]\n        return text_list\n\n    def remove_space(self, text):     <em>#\u5b9a\u4e49\u51fd\u6570<\/em>\n        return text.replace(' ','')   <em># \u53bb\u6389\u6587\u672c\u4e2d\u7684\u7a7a\u683c<\/em>\n\n    def remove_suspension(self, text):\n        return text.replace('...', '\u3002')\n\n    def get_zh_only(self, text):\n        def is_chinese(uchar):\n            if uchar >= 
u'\\u4e00' and uchar &lt;= u'\\u9fa5':  <em># \u5224\u65ad\u4e00\u4e2auchar\u662f\u5426\u662f\u6c49\u5b57<\/em> \u4e2d\u6587\u5b57\u7b26\u7684\u7f16\u7801\u8303\u56f4 \\u4e00 - \\u9fff\uff0c\u53ea\u8981\u5728\u8fd9\u4e2a\u8303\u56f4\u5c31\u53ef\u4ee5\n                return True\n            else:\n                return False\n\n        content = ''\n        for i in text:\n            if is_chinese(i):\n                content = content+i\n        return content\n\n    def remove_sentiment_character(self, sentence):    \n        pattern = re.compile(\"&#91;^\\u4e00-\\u9fa5^,^.^!^\uff0c^\u3002^?^\uff1f^\uff01^a-z^A-Z^0-9]\")  <em>#\u53ea\u4fdd\u7559\u4e2d\u82f1\u6587\u3001\u6570\u5b57\u548c\u7b26\u53f7\uff0c\u53bb\u6389\u5176\u4ed6\u4e1c\u897f<\/em>\n        <em>#\u82e5\u53ea\u4fdd\u7559\u4e2d\u82f1\u6587\u548c\u6570\u5b57\uff0c\u5219\u66ff\u6362\u4e3a&#91;^\\u4e00-\\u9fa5^a-z^A-Z^0-9]<\/em>\n        line = re.sub(pattern,'',sentence)  <em>#\u628a\u6587\u672c\u4e2d\u5339\u914d\u5230\u7684\u5b57\u7b26\u66ff\u6362\u6210\u7a7a\u5b57\u7b26<\/em>\n        new_sentence=''.join(line.split())    <em>#\u53bb\u9664\u7a7a\u767d<\/em>\n        return new_sentence\n\n    def to_simple(self, sentence):\n        new_sentence = OpenCC('t2s').convert(sentence)   <em># \u7e41\u4f53\u8f6c\u4e3a\u7b80\u4f53<\/em>\n        return new_sentence\n\n    def to_tradition(self, sentence):\n        new_sentence = OpenCC('s2t').convert(sentence)   <em># \u7b80\u4f53\u8f6c\u4e3a\u7e41\u4f53<\/em>\n        return new_sentence\n\n    def remove_html(self, text):\n        return BeautifulSoup(text, 'html.parser').get_text() <em>#\u53bb\u6389html\u6807\u7b7e<\/em>\n\n    def tokenizer_batch_text(self, text_list):\n        tokenized_text = &#91;]\n        len_text = len(text_list)\n        with torch.no_grad():\n            steps = self._batch_size\n            for start_idx in tqdm(range(0, len_text, steps)):\n                if start_idx + steps > len_text:\n                    tokenized_text += 
nlp.seg(text_list&#91;start_idx:])&#91;0]\n                else:\n                    tokenized_text += nlp.seg(text_list&#91;start_idx:start_idx+steps])&#91;0]\n        return tokenized_text\n\n    def remove_stop_words(self, words_list, stop_words_dir, with_space=False):\n        \"\"\"\n        \u4e2d\u6587\u6570\u636e\u6e05\u6d17  stopwords_chineses.txt\u5b58\u653e\u5728\u535a\u5ba2\u56ed\u6587\u4ef6\u4e2d\n        :param text:\n        :return:\n        \"\"\"\n        stop_word_filepath_list = glob(stop_words_dir + \"\/*.txt\")\n        for stop_word_filepath in stop_word_filepath_list:\n            with open(stop_word_filepath) as fp:\n                stopwords = {}.fromkeys(&#91;line.rstrip() for line in fp]) <em>#\u52a0\u8f7d\u505c\u7528\u8bcd(\u4e2d\u6587)<\/em>\n        eng_stopwords = set(stopwords) <em>#\u53bb\u6389\u91cd\u590d\u7684\u8bcd<\/em>\n        words = &#91;w for w in words_list if w not in eng_stopwords] <em>#\u53bb\u9664\u6587\u672c\u4e2d\u7684\u505c\u7528\u8bcd<\/em>\n        if with_space:\n            return ' '.join(words)\n        else:\n            return ''.join(words)\nltp\n\n\nfile \/root\/.cache\/torch\/ltp\/8909177e47aa4daf900c569b86053ac68838d09da28c7bbeb42b8efcb08f56aa-edb9303f86310d4bcfd1ac0fa20a744c9a7e13ee515fe3cf88ad31921ed616b2-extracted\/config.json not found\nfile \/root\/.cache\/torch\/ltp\/8909177e47aa4daf900c569b86053ac68838d09da28c7bbeb42b8efcb08f56aa-edb9303f86310d4bcfd1ac0fa20a744c9a7e13ee515fe3cf88ad31921ed616b2-extracted\/config.json not found\ncleaner = TextCleaner(remove_stop_words=True, with_space=True)\ncontents = &#91;'   \u5927\u5bb6\u597d\uff0c \u6b22\u8fce\u4e00\u8d77\u6765\u5b66\u4e60\u6587\u672c\u7684\u7a7a\u683c   \u53bb\u9664   \uff01', '   \u5927\u5bb6\u597d\uff0c\u6587\u672c\u7684\u7a7a\u683c   \u53bb\u9664   \uff01']\nresults = cleaner.clean_text(contents)\nprint(results)\n0%|          | 0\/2 &#91;00:00&lt;?, ?it\/s]\n\n\n\n  0%|          | 0\/1 &#91;00:00&lt;?, ?it\/s]\n\n\n&#91;'\u597d \uff0c 
\u5b66\u4e60 \u6587\u672c \u7a7a\u683c \u53bb\u9664 \uff01', '\u597d \uff0c \u6587\u672c \u7a7a\u683c \u53bb\u9664 \uff01']<\/code><\/pre>\n\n\n\n<h3>\u53bb\u9664\u7a7a\u683c<\/h3>\n\n\n\n<pre class=\"wp-block-code\"><code><em># \u53bb\u9664\u7a7a\u683c<\/em>\ncontents = '   \u5927\u5bb6\u597d\uff0c \u6b22\u8fce\u4e00\u8d77\u6765\u5b66\u4e60\u6587\u672c\u7684\u7a7a\u683c   \u53bb\u9664   \uff01'\nprint('\u5904\u7406\u524d\u6587\u672c\uff1a'+contents)\ndef process(our_data):     <em>#\u5b9a\u4e49\u51fd\u6570<\/em>\n    content = our_data.replace(' ','')   <em># \u53bb\u6389\u6587\u672c\u4e2d\u7684\u7a7a\u683c<\/em>\n    print('\u5904\u7406\u540e\u6587\u672c\uff1a'+content)\nprocess(contents)\n\u5904\u7406\u524d\u6587\u672c\uff1a   \u5927\u5bb6\u597d\uff0c \u6b22\u8fce\u4e00\u8d77\u6765\u5b66\u4e60\u6587\u672c\u7684\u7a7a\u683c   \u53bb\u9664   \uff01\n\u5904\u7406\u540e\u6587\u672c\uff1a\u5927\u5bb6\u597d\uff0c\u6b22\u8fce\u4e00\u8d77\u6765\u5b66\u4e60\u6587\u672c\u7684\u7a7a\u683c\u53bb\u9664\uff01<\/code><\/pre>\n\n\n\n<h3>\u53bb\u9664\u7a7a\u683c\u7684\u540c\u65f6\u628a\u7701\u7565\u53f7\u8f6c\u6362\u4e3a\u53e5\u53f7<\/h3>\n\n\n\n<pre class=\"wp-block-code\"><code><em># \u53bb\u9664\u7a7a\u683c\u7684\u540c\u65f6\u628a\u7701\u7565\u53f7\u8f6c\u6362\u4e3a\u53e5\u53f7<\/em>\ncontents = '   \u5927\u5bb6\u597d\uff0c \u8fd9\u91cc\u8fd8\u6709  \u5f88\u591a\u7684\u77e5\u8bc6...\u4e00\u8d77\u62c9\u5b66\u4e60\u5427 \uff01'\nprint('\u5904\u7406\u524d\u6587\u672c\uff1a'+contents)\ndef process(data):     <em>#\u5b9a\u4e49\u51fd\u6570<\/em>\n    content1 = data.replace(' ','')    <em># \u53bb\u6389\u6587\u672c\u4e2d\u7684\u7a7a\u683c<\/em>\n    content2 = content1.replace('...','\u3002')    <em># \u628a\u7701\u7565\u53f7\u8f6c\u6362\u4e3a\u53e5\u53f7<\/em>\n    print('\u5904\u7406\u540e\u6587\u672c\uff1a'+ content2)\nprocess(contents)\n\u5904\u7406\u524d\u6587\u672c\uff1a   \u5927\u5bb6\u597d\uff0c \u8fd9\u91cc\u8fd8\u6709  
\u5f88\u591a\u7684\u77e5\u8bc6...\u4e00\u8d77\u62c9\u5b66\u4e60\u5427 \uff01\n\u5904\u7406\u540e\u6587\u672c\uff1a\u5927\u5bb6\u597d\uff0c\u8fd9\u91cc\u8fd8\u6709\u5f88\u591a\u7684\u77e5\u8bc6\u3002\u4e00\u8d77\u62c9\u5b66\u4e60\u5427\uff01<\/code><\/pre>\n\n\n\n<h3>\u8ba9\u6587\u672c\u53ea\u4fdd\u7559\u6c49\u5b57<\/h3>\n\n\n\n<pre class=\"wp-block-code\"><code>def is_chinese(uchar):\n    if uchar &gt;= u'\\u4e00' and uchar &lt;= u'\\u9fa5':  <em># \u5224\u65ad\u4e00\u4e2auchar\u662f\u5426\u662f\u6c49\u5b57<\/em>\n        return True\n    else:\n        return False\n\ndef allcontents(contents):\n    content = ''\n    for i in contents:\n        if is_chinese(i):\n            content = content+i\n    print('\\n\u5904\u7406\u540e\u7684\u53e5\u5b50\u4e3a:\\n'+content)\n\ncentents = '1,2,3...\u6211\u4eec\u5f00\u59cb\u5427\uff0c \u52a0\u6cb9\uff01'\nprint('\u539f\u53e5\u5b50\u4e3a:\\n'+centents)\nallcontents(centents)\n\u539f\u53e5\u5b50\u4e3a:\n1,2,3...\u6211\u4eec\u5f00\u59cb\u5427\uff0c \u52a0\u6cb9\uff01\n\n\u5904\u7406\u540e\u7684\u53e5\u5b50\u4e3a:\n\u6211\u4eec\u5f00\u59cb\u5427\u52a0\u6cb9<\/code><\/pre>\n\n\n\n<h3>\u6587\u672c\u4e2d\u7684\u8868\u60c5\u7b26\u53f7\u53bb\u9664<\/h3>\n\n\n\n<pre class=\"wp-block-code\"><code>import re\nsentence='\u73b0\u5728\u542c\u7740\u97f3\u4e50,duo rui mi,\u5f88\u5f00\u5fc3*_*'\nprint('\u539f\u53e5\u5b50\u4e3a:\\n'+sentence)\n\ndef clear_character(sentence):    \n    pattern = re.compile(\"&#91;^\\u4e00-\\u9fa5^,^.^!^a-z^A-Z^0-9]\")  <em>#\u53ea\u4fdd\u7559\u4e2d\u82f1\u6587\u3001\u6570\u5b57\u548c\u7b26\u53f7\uff0c\u53bb\u6389\u5176\u4ed6\u4e1c\u897f<\/em>\n    <em>#\u82e5\u53ea\u4fdd\u7559\u4e2d\u82f1\u6587\u548c\u6570\u5b57\uff0c\u5219\u66ff\u6362\u4e3a&#91;^\\u4e00-\\u9fa5^a-z^A-Z^0-9]<\/em>\n    line=re.sub(pattern,'',sentence)  <em>#\u628a\u6587\u672c\u4e2d\u5339\u914d\u5230\u7684\u5b57\u7b26\u66ff\u6362\u6210\u7a7a\u5b57\u7b26<\/em>\n    new_sentence=''.join(line.split())    <em>#\u53bb\u9664\u7a7a\u767d<\/em>\n    
print('\\n\u5904\u7406\u540e\u7684\u53e5\u5b50\u4e3a:\\n'+new_sentence) \n\nclear_character(sentence)\n\u539f\u53e5\u5b50\u4e3a:\n\u73b0\u5728\u542c\u7740\u97f3\u4e50,duo rui mi,\u5f88\u5f00\u5fc3*_*\n\n\u5904\u7406\u540e\u7684\u53e5\u5b50\u4e3a:\n\u73b0\u5728\u542c\u7740\u97f3\u4e50,duoruimi,\u5f88\u5f00\u5fc3<\/code><\/pre>\n\n\n\n<h3>\u7e41\u4f53\u4e2d\u6587\u4e0e\u7b80\u4f53\u4e2d\u6587\u8f6c\u6362<\/h3>\n\n\n\n<pre class=\"wp-block-code\"><code>from opencc import OpenCC\n\nsentence = '\u4f60\u73b0\u5728\u8bfb\u7684\u8fd9\u91cc\u662f\u7b80\u4f53\uff0c\u8fd9\u91cc\u662f\u7e41\u4f53\uff0c\u80fd\u770b\u61c2\u5417\uff1f'\nprint('\u539f\u53e5\u5b50\u4e3a:\\n'+sentence)\n\ndef Simplified(sentence):\n    new_sentence = OpenCC('t2s').convert(sentence)   <em># \u7e41\u4f53\u8f6c\u4e3a\u7b80\u4f53<\/em>\n    print('\\n\u5904\u7406\u540e\u7684\u53e5\u5b50\u4e3a:\\n'+new_sentence)\n\ndef Traditional(sentence):\n    new_sentence = OpenCC('s2t').convert(sentence)   <em># \u7b80\u4f53\u8f6c\u4e3a\u7e41\u4f53<\/em>\n    print('\\n\u5904\u7406\u540e\u7684\u53e5\u5b50\u4e3a:\\n'+new_sentence) \n\nSimplified(sentence)\nTraditional(sentence)\n\u539f\u53e5\u5b50\u4e3a:\n\u4f60\u73b0\u5728\u8bfb\u7684\u8fd9\u91cc\u662f\u7b80\u4f53\uff0c\u8fd9\u91cc\u662f\u7e41\u4f53\uff0c\u80fd\u770b\u61c2\u5417\uff1f\n\n\u5904\u7406\u540e\u7684\u53e5\u5b50\u4e3a:\n\u4f60\u73b0\u5728\u8bfb\u7684\u8fd9\u91cc\u662f\u7b80\u4f53\uff0c\u8fd9\u91cc\u662f\u7e41\u4f53\uff0c\u80fd\u770b\u61c2\u5417\uff1f\n\n\u5904\u7406\u540e\u7684\u53e5\u5b50\u4e3a:\n\u4f60\u73b0\u5728\u8bfb\u7684\u8fd9\u91cc\u662f\u7b80\u4f53\uff0c\u8fd9\u91cc\u662f\u7e41\u4f53\uff0c\u80fd\u770b\u61c2\u5417\uff1f<\/code><\/pre>\n\n\n\n<p><code>OpenCC<\/code>\u7684\u53c2\u6570\u8bbe\u7f6e\uff1a<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>- hk2s: Traditional Chinese (Hong Kong standard) to Simplified Chinese\n- s2hk: Simplified Chinese to Traditional Chinese (Hong Kong standard)\n- s2t: Simplified Chinese to Traditional Chinese\n- s2tw: 
Simplified Chinese to Traditional Chinese (Taiwan standard)\n- s2twp: Simplified Chinese to Traditional Chinese (Taiwan standard, with phrases)\n- t2hk: Traditional Chinese to Traditional Chinese (Hong Kong standard)\n- t2s: Traditional Chinese to Simplified Chinese\n- t2tw: Traditional Chinese to Traditional Chinese (Taiwan standard)\n- tw2s: Traditional Chinese (Taiwan standard) to Simplified Chinese\n- tw2sp: Traditional Chinese (Taiwan standard) to Simplified Chinese (with phrases)<\/code><\/pre>\n\n\n\n<h3>\u53bb\u9664html\u6807\u7b7e\u548c\u505c\u7528\u8bcd<\/h3>\n\n\n\n<pre class=\"wp-block-code\"><code>from bs4 import BeautifulSoup\nimport jieba\nfrom glob import glob\n\ndef clean_chineses_text(text, with_space=False):\n    \"\"\"\n    \u4e2d\u6587\u6570\u636e\u6e05\u6d17  stopwords_chineses.txt\u5b58\u653e\u5728\u535a\u5ba2\u56ed\u6587\u4ef6\u4e2d\n    :param text:\n    :return:\n    \"\"\"\n    text = BeautifulSoup(text, 'html.parser').get_text() <em>#\u53bb\u6389html\u6807\u7b7e<\/em>\n    text = jieba.lcut(text)\n    stop_word_filepath_list = glob(\".\/\u505c\u7528\u8bcd\/*.txt\")\n<em>#     print(stop_word_filepath_list)<\/em>\n    for stop_word_filepath in stop_word_filepath_list:\n        with open(stop_word_filepath) as fp:\n            stopwords = {}.fromkeys(&#91;line.rstrip() for line in fp]) <em>#\u52a0\u8f7d\u505c\u7528\u8bcd(\u4e2d\u6587)<\/em>\n    eng_stopwords = set(stopwords) <em>#\u53bb\u6389\u91cd\u590d\u7684\u8bcd<\/em>\n    words = &#91;w for w in text if w not in eng_stopwords] <em>#\u53bb\u9664\u6587\u672c\u4e2d\u7684\u505c\u7528\u8bcd<\/em>\n    if with_space:\n        return ' '.join(words)\n    else:\n        return ''.join(words)\nclean_chineses_text(\"\u4f60\u73b0\u5728\u8bfb\u7684\u8fd9\u91cc\u662f\u7b80\u4f53\uff0c\u8fd9\u91cc\u662f\u7e41\u4f53\uff0c\u80fd\u770b\u61c2\u5417\uff1f\", with_space=True)\nBuilding prefix dict from the default dictionary ...\nLoading model from cache \/tmp\/jieba.cache\nLoading model cost 0.703 
seconds.\nPrefix dict has been built successfully.\n\n\n\n\n\n'\u8bfb \u7b80\u4f53 \uff0c \u8fd9\u91cc \u7e41\u4f53 \uff0c \u80fd\u770b\u61c2 \u5417 \uff1f'\nENGLISH_STOP_WORDS = frozenset(&#91;\n    \"about\", \"above\", \"across\", \"after\", \"afterwards\", \"again\", \"against\",\n    \"all\", \"almost\", \"alone\", \"along\", \"already\", \"also\", \"although\", \"always\",\n    \"am\", \"among\", \"amongst\", \"amoungst\", \"amount\", \"an\", \"and\", \"another\",\n    \"any\", \"anyhow\", \"anyone\", \"anything\", \"anyway\", \"anywhere\", \"are\",\n    \"around\", \"as\", \"at\", \"back\", \"be\", \"became\", \"because\", \"become\",\n    \"becomes\", \"becoming\", \"been\", \"before\", \"beforehand\", \"behind\", \"being\",\n    \"below\", \"beside\", \"besides\", \"between\", \"beyond\", \"bill\", \"both\",\n    \"bottom\", \"but\", \"by\", \"call\", \"can\", \"cannot\", \"cant\", \"co\", \"con\",\n    \"could\", \"couldnt\", \"cry\", \"de\", \"describe\", \"detail\", \"do\", \"done\",\n    \"down\", \"due\", \"during\", \"each\", \"eg\", \"eight\", \"either\", \"eleven\", \"else\",\n    \"elsewhere\", \"empty\", \"enough\", \"etc\", \"even\", \"ever\", \"every\", \"everyone\",\n    \"everything\", \"everywhere\", \"except\", \"few\", \"fifteen\", \"fifty\", \"fill\",\n    \"find\", \"fire\", \"first\", \"five\", \"for\", \"former\", \"formerly\", \"forty\",\n    \"found\", \"four\", \"from\", \"front\", \"full\", \"further\", \"get\", \"give\", \"go\",\n    \"had\", \"has\", \"hasnt\", \"have\", \"he\", \"hence\", \"her\", \"here\", \"hereafter\",\n    \"hereby\", \"herein\", \"hereupon\", \"hers\", \"herself\", \"him\", \"himself\", \"his\",\n    \"how\", \"however\", \"hundred\", \"ie\", \"if\", \"in\", \"inc\", \"indeed\",\n    \"interest\", \"into\", \"is\", \"it\", \"its\", \"itself\", \"keep\", \"last\", \"latter\",\n    \"latterly\", \"least\", \"less\", \"ltd\", \"made\", \"many\", \"may\", \"me\",\n    \"meanwhile\", \"might\", \"mill\", 
\"mine\", \"more\", \"moreover\", \"most\", \"mostly\",\n    \"move\", \"much\", \"must\", \"my\", \"myself\", \"name\", \"namely\", \"neither\",\n    \"never\", \"nevertheless\", \"next\", \"nine\", \"no\", \"nobody\", \"none\", \"noone\",\n    \"nor\", \"not\", \"nothing\", \"now\", \"nowhere\", \"of\", \"off\", \"often\", \"on\",\n    \"once\", \"one\", \"only\", \"onto\", \"or\", \"other\", \"others\", \"otherwise\", \"our\",\n    \"ours\", \"ourselves\", \"out\", \"over\", \"own\", \"part\", \"per\", \"perhaps\",\n    \"please\", \"put\", \"rather\", \"re\", \"same\", \"see\", \"seem\", \"seemed\",\n    \"seeming\", \"seems\", \"serious\", \"several\", \"she\", \"should\", \"show\", \"side\",\n    \"since\", \"sincere\", \"six\", \"sixty\", \"so\", \"some\", \"somehow\", \"someone\",\n    \"something\", \"sometime\", \"sometimes\", \"somewhere\", \"still\", \"such\",\n    \"system\", \"take\", \"ten\", \"than\", \"that\", \"the\", \"their\", \"them\",\n    \"themselves\", \"then\", \"thence\", \"there\", \"thereafter\", \"thereby\",\n    \"therefore\", \"therein\", \"thereupon\", \"these\", \"they\", \"thick\", \"thin\",\n    \"third\", \"this\", \"those\", \"though\", \"three\", \"through\", \"throughout\",\n    \"thru\", \"thus\", \"to\", \"together\", \"too\", \"top\", \"toward\", \"towards\",\n    \"twelve\", \"twenty\", \"two\", \"un\", \"under\", \"until\", \"up\", \"upon\", \"us\",\n    \"very\", \"via\", \"was\", \"we\", \"well\", \"were\", \"what\", \"whatever\", \"when\",\n    \"whence\", \"whenever\", \"where\", \"whereafter\", \"whereas\", \"whereby\",\n    \"wherein\", \"whereupon\", \"wherever\", \"whether\", \"which\", \"while\", \"whither\",\n    \"who\", \"whoever\", \"whole\", \"whom\", \"whose\", \"why\", \"will\", \"with\",\n    \"within\", \"without\", \"would\", \"yet\", \"you\", \"your\", \"yours\", \"yourself\",\n    \"yourselves\", \"a\", \"b\", \"c\", \"d\", \"e\", \"f\", \"g\", \"h\", \"i\", \"j\", \"k\", \"l\",\n    \"m\", \"n\", 
\"o\", \"p\", \"q\", \"r\", \"s\", \"t\", \"u\", \"v\", \"w\", \"x\", \"y\", \"z\"])<\/code><\/pre>\n\n\n\n<h2>\u7279\u5f81\u62bd\u53d6<\/h2>\n\n\n\n<ul><li>BOW<\/li><li>TF-IDF<\/li><li>LDA<\/li><\/ul>\n\n\n\n<h3>\u6587\u672c\u7279\u5f81\u63d0\u53d6\u7c7b<\/h3>\n\n\n\n<pre class=\"wp-block-code\"><code>import numpy as np\nimport pandas as pd\nfrom tqdm.auto import tqdm\nfrom sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer\n\nimport sys\n!ls ..\/package\/\nsys.path.insert(0, \"..\/package\/\")\nfrom ltp import LTP\nnlp = LTP(path=\"base\")\n\nfrom gensim.models import Word2Vec\n\nclass TextFeatures:\n    def __init__(self, ngram_range=(1, 2)):\n        self.cvt = CountVectorizer(tokenizer=self.tokenizer, ngram_range=ngram_range)\n        self.tvt = TfidfVectorizer(tokenizer=self.tokenizer, ngram_range=ngram_range)\n        self.hvt = HashingVectorizer(tokenizer=self.tokenizer, ngram_range=ngram_range)\n        self.cleaner = TextCleaner(remove_html_label=True, remove_stop_words=True, with_space=True)\n\n    def clean_text(self, text_list):\n        return self.cleaner.clean_text(text_list)\n\n    def tokenizer(self, text):\n        return text.split(\" \")\n\n    def get_bow(self, text_list):\n        return self.cvt.fit_transform(text_list)\n\n    def get_tfidf(self, text_list):\n        return self.tvt.fit_transform(text_list)\n\n    def get_hashing(self, text_list):\n        return self.hvt.fit_transform(text_list)\nltp\n\n\nfile \/root\/.cache\/torch\/ltp\/8909177e47aa4daf900c569b86053ac68838d09da28c7bbeb42b8efcb08f56aa-edb9303f86310d4bcfd1ac0fa20a744c9a7e13ee515fe3cf88ad31921ed616b2-extracted\/config.json not found\nfile \/root\/.cache\/torch\/ltp\/8909177e47aa4daf900c569b86053ac68838d09da28c7bbeb42b8efcb08f56aa-edb9303f86310d4bcfd1ac0fa20a744c9a7e13ee515fe3cf88ad31921ed616b2-extracted\/config.json not found\ntrain_df = pd.read_csv(\"..\/0.\u6570\u636e\/1.\u60c5\u611f\u5206\u6790\/NLPCC14-SC\/train.tsv\", sep=\"\\t\", 
error_bad_lines=False)\ntrain_df.head()<\/code><\/pre>\n\n\n\n<figure class=\"wp-block-table\"><table><tbody><tr><th><\/th><th>label<\/th><th>text_a<\/th><\/tr><\/tbody><\/table><\/figure>\n\n\n\n<pre class=\"wp-block-code\"><code>set(train_df&#91;\"label\"]), train_df.shape\n({0, 1}, (10000, 2))\ncleaner = TextCleaner(remove_html_label=True, remove_stop_words=True, with_space=True)\ncontents = &#91;'   \u5927\u5bb6\u597d\uff0c \u6b22\u8fce\u4e00\u8d77\u6765\u5b66\u4e60\u6587\u672c\u7684\u7a7a\u683c   \u53bb\u9664   \uff01']\nresults = cleaner.clean_text(contents)\nprint(results)\n0%|          | 0\/1 &#91;00:00&lt;?, ?it\/s]\n\n\n\n  0%|          | 0\/1 &#91;00:00&lt;?, ?it\/s]\n\n\n&#91;'\u597d \uff0c \u5b66\u4e60 \u6587\u672c \u7a7a\u683c \u53bb\u9664 \uff01']\ntqdm.pandas(desc=\"clean data\")\ntrain_df&#91;\"cleaned_text\"] = cleaner.clean_text(train_df&#91;\"text_a\"].values)\n0%|          | 0\/10000 &#91;00:00&lt;?, ?it\/s]\n\n\n\n  0%|          | 0\/40 &#91;00:00&lt;?, ?it\/s]\ntrain_df.to_csv(\"cleaned_train.csv\", index=None)\n<em># import torch<\/em>\n<em># from tqdm.auto import tqdm<\/em>\n\n<em># tokenized_text = &#91;]<\/em>\n<em># text_list = list(train_df&#91;\"cleaned_text\"].values)<\/em>\n<em># with torch.no_grad():<\/em>\n<em>#     steps = 256<\/em>\n<em>#     for start_idx in tqdm(range(0, train_df.shape&#91;0], steps)):<\/em>\n<em># #         print(start_idx)<\/em>\n<em>#         if start_idx + steps &gt; train_df.shape&#91;0]:<\/em>\n<em>#             tokenized_text += nlp.seg(text_list&#91;start_idx:])&#91;0]<\/em>\n<em>#         else:<\/em>\n<em>#             tokenized_text += nlp.seg(text_list&#91;start_idx:start_idx+steps])&#91;0]<\/em>\n<em># from joblib import dump, load<\/em>\n<em># \u5173\u6389\u663e\u5b58\u5360\u7528<\/em>\n<em># from numba import cuda<\/em>\n\n<em># cuda.select_device(0)<\/em>\n<em># cuda.close()<\/em><\/code><\/pre>\n\n\n\n<h3>BOW<\/h3>\n\n\n\n<pre class=\"wp-block-code\"><code>!ls 
..\/1.\u57fa\u7840\/\u505c\u7528\u8bcd\/\n\u4e2d\u6587\u505c\u7528\u8bcd\u5e93.txt  \u54c8\u5de5\u5927\u505c\u7528\u8bcd\u8868.txt  \u56db\u5ddd\u5927\u5b66\u505c\u7528\u8bcd\u8868.txt  \u767e\u5ea6\u505c\u7528\u8bcd\u8868.txt\nfrom glob import glob\n<em># \u505c\u7528\u8bcd\u5217\u8868<\/em>\nstop_words = &#91;]\ntxt_list = glob(\"..\/1.\u57fa\u7840\/\u505c\u7528\u8bcd\/*.txt\")\nfor txt_path in txt_list:\n    with open(txt_path, \"r\") as fp:\n        lines = fp.readlines()\n    stop_words += &#91;line.strip() for line in lines]\nlen(stop_words)\n3893\nfrom sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer\nfrom sklearn.linear_model import Ridge, Lasso, LinearRegression\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.metrics import accuracy_score, roc_auc_score\ndef tokenizer(text):\n    return text.split(\" \")\n<em># corpus = &#91;\" \".join(text_list) for text_list in tokenized_text]<\/em>\n<em># corpus&#91;:2]<\/em>\ncorpus = train_df&#91;\"cleaned_text\"].values\ncvt = CountVectorizer(stop_words=stop_words, tokenizer=tokenizer, ngram_range=(1, 2))\nx_cvt = cvt.fit_transform(corpus)\nlen(cvt.vocabulary_)\n137525\ny = train_df&#91;\"label\"].values\nX_train, X_val, y_train, y_val = train_test_split(x_cvt, y, test_size=0.1)\n\nclf = Ridge(alpha=500.)\nclf.fit(X_train, y_train)\n\nprint(\"train score: \")\ny_pred = clf.predict(X_train)\nprint(roc_auc_score(y_train, y_pred), accuracy_score(y_train, y_pred&gt;0.5))\nprint()\nprint(\"valid score: \")\ny_pred = clf.predict(X_val)\nprint(roc_auc_score(y_val, y_pred), accuracy_score(y_val, y_pred&gt;0.5))\ntrain score: \n0.8657380740314067 0.798\n\nvalid score: \n0.8009079767378523 0.733<\/code><\/pre>\n\n\n\n<h3>TFIDF<\/h3>\n\n\n\n<pre class=\"wp-block-code\"><code>from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer\ntvt = TfidfVectorizer(stop_words=stop_words, tokenizer=tokenizer, ngram_range=(1, 2))\nx_tvt = 
tvt.fit_transform(corpus)\nlen(tvt.vocabulary_)\n137525\ny = train_df&#91;\"label\"].values\nX_train, X_val, y_train, y_val = train_test_split(x_tvt, y, test_size=0.1)\n\nclf = Ridge(alpha=10.)\nclf.fit(X_train, y_train)\n\nprint(\"train score: \")\ny_pred = clf.predict(X_train)\nprint(roc_auc_score(y_train, y_pred), accuracy_score(y_train, y_pred&gt;0.5))\nprint()\nprint(\"valid score: \")\ny_pred = clf.predict(X_val)\nprint(roc_auc_score(y_val, y_pred), accuracy_score(y_val, y_pred&gt;0.5))\ntrain score: \n0.9349220324539836 0.8745555555555555\n\nvalid score: \n0.7963706773775423 0.728<\/code><\/pre>\n\n\n\n<h3>HashingVectorizer<\/h3>\n\n\n\n<pre class=\"wp-block-code\"><code>from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer\nhvt = HashingVectorizer(stop_words=stop_words, tokenizer=tokenizer, ngram_range=(1, 2))\nx_hvt = hvt.fit_transform(corpus)\ny = train_df&#91;\"label\"].values\nX_train, X_val, y_train, y_val = train_test_split(x_hvt, y, test_size=0.1)\n\nclf = Ridge(alpha=1.)\nclf.fit(X_train, y_train)\n\nprint(\"train score: \")\ny_pred = clf.predict(X_train)\nprint(roc_auc_score(y_train, y_pred), accuracy_score(y_train, y_pred&gt;0.5))\nprint()\nprint(\"valid score: \")\ny_pred = clf.predict(X_val)\nprint(roc_auc_score(y_val, y_pred), accuracy_score(y_val, y_pred&gt;0.5))\ntrain score: \n0.99204728016389 0.969\n\nvalid score: \n0.8349841394447204 0.749<\/code><\/pre>\n\n\n\n<h3>LDA<\/h3>\n\n\n\n<pre class=\"wp-block-code\"><code>train_df = pd.read_csv(\".\/cleaned_train.csv\")\ntrain_df.head()<\/code><\/pre>\n\n\n\n<figure class=\"wp-block-table\"><table><tbody><tr><th><\/th><th>label<\/th><th>text_a<\/th><th>cleaned_text<\/th><\/tr><\/tbody><\/table><\/figure>\n\n\n\n<pre class=\"wp-block-code\"><code>from glob import glob\n<em># \u505c\u7528\u8bcd\u5217\u8868<\/em>\nstop_words = &#91;]\ntxt_list = glob(\"..\/1.\u57fa\u7840\/\u505c\u7528\u8bcd\/*.txt\")\nfor txt_path in txt_list:\n    with open(txt_path, 
\"r\") as fp:\n        lines = fp.readlines()\n    stop_words += &#91;line.strip() for line in lines]\nlen(stop_words)\n3893\nfrom sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer\nfrom sklearn.decomposition import LatentDirichletAllocation\nfrom sklearn.linear_model import Ridge, Lasso, LinearRegression\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.metrics import accuracy_score, roc_auc_score\ndef tokenizer(text):\n    return text.split(\" \")\n\ncorpus = train_df&#91;\"cleaned_text\"].values\ncorpus = &#91;string if string is not np.nan else \"\" for string in corpus]\ncvt = CountVectorizer(tokenizer=tokenizer, ngram_range=(1, 2))\nx_cvt = cvt.fit_transform(corpus)\nlda = LatentDirichletAllocation(n_components=32, doc_topic_prior=None, topic_word_prior=None, learning_method='batch', \n                                learning_decay=0.7, learning_offset=50.0, max_iter=10, batch_size=128, evaluate_every=-1, \n                                total_samples=1000000.0, perp_tol=0.1, mean_change_tol=0.001, max_doc_update_iter=100, \n                                n_jobs=None, verbose=0, random_state=402)\ndocres = lda.fit_transform(x_cvt)\ndocres.shape\n(10000, 32)\ny = train_df&#91;\"label\"].values\nX_train, X_val, y_train, y_val = train_test_split(docres, y, test_size=0.1)\n\nclf = Ridge(alpha=500.)\nclf.fit(X_train, y_train)\n\nprint(\"train score: \")\ny_pred = clf.predict(X_train)\nprint(roc_auc_score(y_train, y_pred), accuracy_score(y_train, y_pred&gt;0.5))\nprint()\nprint(\"valid score: \")\ny_pred = clf.predict(X_val)\nprint(roc_auc_score(y_val, y_pred), accuracy_score(y_val, y_pred&gt;0.5))\ntrain score: \n0.5984059229289742 0.5741111111111111\n\nvalid score: \n0.5797141495568878 0.57<\/code><\/pre>\n\n\n\n<blockquote class=\"wp-block-quote\"><p>gensim<\/p><\/blockquote>\n\n\n\n<pre class=\"wp-block-code\"><code>corpus = &#91;string.split(\" \") for string in corpus]\nfrom gensim import 
corpora\ndictionary = corpora.Dictionary(corpus)\ndictionary.save('qzone.dict')\ndictionary.filter_extremes(no_below=20, no_above=0.5)\ndictionary.compactify()\ncorpus = &#91;dictionary.doc2bow(s) for s in corpus]\ncorpora.MmCorpus.serialize('corpus_bow.mm', corpus)  <em># \u5b58\u50a8\u8bed\u6599\u5e93<\/em>\nfrom gensim.models import LdaModel\n\nnum_topics = 100\nchunksize = 2000\npasses = 20\niterations = 400\neval_every = None \n\ntemp = dictionary&#91;0]\nid2word = dictionary.id2token\n\nmodel = LdaModel(\n    corpus=corpus,\n    id2word=id2word,\n    chunksize=chunksize,\n    alpha='auto',\n    eta='auto',\n    iterations=iterations,\n    num_topics=num_topics,\n    passes=passes,\n    eval_every=eval_every\n)\n\nmodel.save('qzone.model')\ntop_topics = model.top_topics(corpus)\navg_topic_coherence = sum(&#91;t&#91;1] for t in top_topics]) \/ num_topics\nprint('Average topic coherence: %.4f.' % avg_topic_coherence)\nAverage topic coherence: -5.7200.\nlen(top_topics), len(corpus)\n(100, 10000)<\/code><\/pre>\n\n\n\n<h3>LTP\u7279\u5f81\u63d0\u53d6<\/h3>\n\n\n\n<pre class=\"wp-block-code\"><code>import sys\n!ls ..\/package\/\n\nsys.path.insert(0, \"..\/package\/\")\n\nfrom ltp import LTP\nnlp = LTP(path=\"base\")\nltp\n\n\nfile \/root\/.cache\/torch\/ltp\/8909177e47aa4daf900c569b86053ac68838d09da28c7bbeb42b8efcb08f56aa-edb9303f86310d4bcfd1ac0fa20a744c9a7e13ee515fe3cf88ad31921ed616b2-extracted\/config.json not found\nfile \/root\/.cache\/torch\/ltp\/8909177e47aa4daf900c569b86053ac68838d09da28c7bbeb42b8efcb08f56aa-edb9303f86310d4bcfd1ac0fa20a744c9a7e13ee515fe3cf88ad31921ed616b2-extracted\/config.json not found\nseg, hidden = nlp.seg(&#91;\"\u4ed6\u53eb\u6c64\u59c6\u53bb\u62ff\u5916\u8863\u3002\"])\npos = nlp.pos(hidden)\nner = nlp.ner(hidden)\nsrl = nlp.srl(hidden)\ndep = nlp.dep(hidden)\nsdp = nlp.sdp(hidden)<\/code><\/pre>\n\n\n\n<blockquote 
class=\"wp-block-quote\"><p>\u5bf9\u4e8eLTP\u63d0\u53d6\u7684\u7279\u5f81\uff0c\u53ef\u4ee5\u53c2\u8003LTP\u7684\u6587\u6863<\/p><\/blockquote>\n\n\n\n<ul><li>\u9759\u6001\u8bcd\u5411\u91cf<\/li><li>\u52a8\u6001\u8bcd\u5411\u91cf<\/li><\/ul>\n","protected":false},"excerpt":{"rendered":"<p>\u6458\u81ea\u77e5\u4e4e\uff1a bookname\u5d4c\u5165\u5f0fAI\u7b97\u6cd5\u7814\u7a76 \u4e2d\u6587\u6587\u672c\u6e05\u6d17 \u4e2d\u6587\u6587\u672c\u6e05\u6d17\uff1a &#8211; \u53bb\u9664\u6307\u5b9a\u65e0\u7528\u7684\u7b26 &hellip; <a href=\"http:\/\/139.9.1.231\/index.php\/2022\/08\/24\/textclear\/\" class=\"more-link\">\u7ee7\u7eed\u9605\u8bfb<span class=\"screen-reader-text\">\u4e2d\u6587\u6587\u672c\u6e05\u6d17\u4e0e\u7279\u5f81\u63d0\u53d6<\/span><\/a><\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":[],"categories":[8,11,21],"tags":[],"_links":{"self":[{"href":"http:\/\/139.9.1.231\/index.php\/wp-json\/wp\/v2\/posts\/6204"}],"collection":[{"href":"http:\/\/139.9.1.231\/index.php\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"http:\/\/139.9.1.231\/index.php\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"http:\/\/139.9.1.231\/index.php\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"http:\/\/139.9.1.231\/index.php\/wp-json\/wp\/v2\/comments?post=6204"}],"version-history":[{"count":5,"href":"http:\/\/139.9.1.231\/index.php\/wp-json\/wp\/v2\/posts\/6204\/revisions"}],"predecessor-version":[{"id":6780,"href":"http:\/\/139.9.1.231\/index.php\/wp-json\/wp\/v2\/posts\/6204\/revisions\/6780"}],"wp:attachment":[{"href":"http:\/\/139.9.1.231\/index.php\/wp-json\/wp\/v2\/media?parent=6204"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"http:\/\/139.9.1.231\/index.php\/wp-json\/wp\/v2\/categories?post=6204"},{"taxonomy":"post_tag","embeddable":true,"href":"http:\/\/139.9.1.231\/index.php\/wp-json\/wp\/v2\/tags?post=6204"}],"curi
es":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}