1. jieba.analyse.extract_tags(text):参数 text 必须是一个完整的字符串(不能是分词后的词语列表),因此需要先把分词列表串接成字符串再传入。
第一步:进行语料库的读取
第二步:进行分词操作
第三步:载入停用词,同时对分词后的语料库进行停用词的去除
第四步:选取一段文本分词列表,串接成字符串,使用jieba.analyse.extract_tags提取主题词
import pandas as pd
import numpy as np
import jieba
import jieba.analyse

# 1. Load the news corpus. The file is read as a headerless table, so the
#    four column names are supplied explicitly.
df_data = pd.read_table(
    'data/val.txt',
    names=['category', 'theme', 'URL', 'content'],
    encoding='utf-8',
)

# 2. Tokenize every document. The result is a list of token lists.
df_contents = df_data.content.values.tolist()
Jie_content = []
for df_content in df_contents:
    # Skip missing cells (pandas yields NaN floats) and whitespace-only
    # documents such as '\t\n'. The original compared the token *list* to the
    # string '\t\n', which is always unequal, so that filter never fired.
    if not isinstance(df_content, str) or not df_content.strip():
        continue
    split_content = jieba.lcut(df_content)
    if len(split_content) > 1:
        Jie_content.append(split_content)

# 3. Load the stop-word list: one word per line, tab separator.
#    quoting=3 is csv.QUOTE_NONE, so quote characters are read literally;
#    index_col=False keeps the first column from being used as the index.
stopwords = pd.read_csv(
    'stopwords.txt',
    sep='\t',
    quoting=3,
    names=['stopwords'],
    index_col=False,
    encoding='utf-8',
)
print(stopwords.head())


def drop_stops(Jie_content, stopwords):
    """Remove stop words from tokenized documents.

    Args:
        Jie_content: Iterable of documents, each an iterable of tokens.
        stopwords: Iterable of tokens to discard.

    Returns:
        A ``(clean_content, all_words)`` tuple. ``clean_content`` holds one
        filtered token list per input document (possibly empty), in input
        order. ``all_words`` is the flat list of every kept token across all
        documents, in order of appearance.
    """
    # Build the lookup set once so each membership test is O(1) instead of a
    # linear scan over the whole stop-word list for every token.
    stop_set = set(stopwords)
    clean_content = []
    all_words = []
    for j_content in Jie_content:
        line_clean = [word for word in j_content if word not in stop_set]
        clean_content.append(line_clean)
        all_words.extend(line_clean)
    return clean_content, all_words


# Convert the DataFrame column of stop words to a plain list.
stopwords = stopwords.stopwords.values.tolist()
clean_content, all_words = drop_stops(Jie_content, stopwords)
print(clean_content[0])

# 4. Extract keywords with jieba. extract_tags needs a single string, so the
#    token list of the chosen document is joined back together first.
#    NOTE: `index` is a position in the filtered `clean_content` list, which
#    may differ from the row number in `df_data` because documents that were
#    skipped above are not included.
index = 2000
content_word = ''.join(clean_content[index])
content_text = ' '.join(
    jieba.analyse.extract_tags(content_word, topK=5, withWeight=False)
)
print(content_word)
print(content_text)