Word segmentation: jieba.cut
import jieba

words = jieba.cut("我来到北京大学", cut_all=True)
print('全模式:' + '/'.join([w for w in words]))   # full mode
words = jieba.cut("我来到北京大学", cut_all=False)
print('精确模式:' + '/'.join([w for w in words]))  # precise mode (the default)
words = jieba.cut_for_search("小明毕业于北京大学,后在美国哈佛大学深造")
print('/'.join([w for w in words]))  # search-engine mode: on top of precise mode, long words are split again
全模式:我/来到/北京/北京大学/大学
精确模式:我/来到/北京大学
Exercise: practice adding a custom user dictionary (see the sketch below).
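A minimal sketch of the custom-dictionary exercise, assuming a hypothetical dictionary file named userdict.txt; jieba.load_userdict and jieba.add_word are the relevant entry points:

import jieba

# Option 1: load a user dictionary file, one entry per line in the form "词语 [词频] [词性]"
# ('userdict.txt' is a placeholder file name)
jieba.load_userdict('userdict.txt')

# Option 2: add individual entries programmatically
jieba.add_word('住校申请表')

words = jieba.cut("你想去学校填写学生寒暑假住校申请表吗?", cut_all=False)
print('/'.join(words))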
Part-of-speech tagging: jieba.posseg
import jieba.posseg as pg

for word, flag in pg.cut("你想去学校填写学生寒暑假住校申请表吗?"):
    print('%s %s' % (word, flag))
Segmentation with stopword filtering
import jieba
import pandas as pd
import numpy as np

paths = '中英文停用词.xlsx'
dfs = pd.read_excel(paths, dtype=str)   # stopword list loaded from Excel (used in a later cell)
stopwords = ['想', '去', '吗', '?']
words = jieba.cut("你想去学校填写学生寒暑假住校申请表吗?")
'/'.join([w for w in words if (w not in stopwords)])  # '/' is only used as a separator between the remaining words
'你/学校/填写/学生/寒暑假/住校/申请表'
A function that converts a txt file into a DataFrame
import random
import jieba.posseg as pg
import pandas as pd
import numpy as np

def generatorInfo(file_name):
    # read the text file (each line starts with a category label followed by the article text)
    with open(file_name, encoding='utf-8') as file:
        line_list = [k.strip() for k in file.readlines()]
    data = []
    stop_set = set(dfs['stopwords'])  # test membership against a set of values, not the Series index
    for k in random.sample(line_list, 1000):
        t = k.split(maxsplit=1)
        #data_label_list.append(t[0])
        #data_content_list.append(t[1])
        data.append([t[0], ' '.join([w for w, flag in pg.cut(t[1])
                                     if (w not in stop_set) and (w != ' ') and (len(w) >= 2)])])
    return data

file_name = 'cnews.train.txt'
df = pd.DataFrame(np.array(generatorInfo(file_name)), columns=['类别', '分词'])
path = '训练集分词结果(随机选取1000个样本).xlsx'
df.to_excel(path, index=False)
df

[Figure: the segmented training samples as a DataFrame of 类别 and 分词]
Word cloud: wordcloud
%pylab inline
import matplotlib.pyplot as plt
from wordcloud import WordCloud

text = ' '.join(list(df['分词']))
wcloud = WordCloud(font_path='simsun.ttc',    # font path (needed to render Chinese)
                   background_color='white',  # background color
                   max_words=500,             # maximum number of words shown
                   max_font_size=150,         # maximum font size
                   #mask=mask                 # optional background image
                   )
wcloud = wcloud.generate(text)  # generate the word cloud
plt.imshow(wcloud)
plt.axis('off')
plt.show()

[Figure: word cloud of the segmented training samples]
Keyword extraction: jieba.analyse.extract_tags
import jieba.analyse
import pandas as pd
import numpy as np

path = '训练集分词结果(随机选取1000个样本).xlsx'
df = pd.read_excel(path, dtype=str)
s = ' '.join(list(df['分词']))
for w, x in jieba.analyse.extract_tags(s, withWeight=True):
    print('%s %s' % (w, x))

[Figure: extracted keywords and their TF-IDF weights]
Exercise: extract keywords using the TextRank algorithm (a sketch follows).
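A minimal sketch of the TextRank exercise on the same corpus as above; jieba.analyse.textrank takes the same arguments as extract_tags, plus an allowPOS filter:

import jieba.analyse
import pandas as pd

path = '训练集分词结果(随机选取1000个样本).xlsx'
df = pd.read_excel(path, dtype=str)
s = ' '.join(list(df['分词']))

# TextRank-based keyword extraction; allowPOS keeps only the listed POS tags as candidates
for w, x in jieba.analyse.textrank(s, topK=20, withWeight=True, allowPOS=('ns', 'n', 'vn', 'v')):
    print('%s %s' % (w, x))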
# Extract keywords separately for each category
import jieba.analyse
import pandas as pd
import numpy as np

path = '训练集分词结果(随机选取1000个样本).xlsx'
df = pd.read_excel(path, dtype=str)
tag = list(set(list(df['类别'])))
for t in tag:
    s = ' '.join(list(df[df['类别'] == t]['分词']))
    print(t)
    for w, x in jieba.analyse.extract_tags(s, withWeight=True):
        print('%s %s' % (w, x))

[Figure: keywords extracted for each category]
Building word vectors
There are two simple tools for building word vectors: TfidfTransformer and CountVectorizer (both from scikit-learn).
# CountVectorizer converts the documents into a term-frequency (count) matrix
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

path = '训练集分词结果(随机选取1000个样本).xlsx'
df = pd.read_excel(path, dtype=str)
corpus = df['分词']
#vectorizer = CountVectorizer(max_features=5000)
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
print(X)

[Figure: sparse term-frequency matrix printed by CountVectorizer]
from sklearn.feature_extraction.text import TfidfTransformer
import datetime

starttime = datetime.datetime.now()
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(X)      # reweight the count matrix X with TF-IDF
word = vectorizer.get_feature_names()     # on scikit-learn >= 1.0 use get_feature_names_out()
weight = tfidf.toarray()
print(weight)

[Figure: dense TF-IDF weight matrix]
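As a side note (an addition, not part of the original walkthrough), scikit-learn's TfidfVectorizer combines the two steps above, CountVectorizer followed by TfidfTransformer, in a single object; a minimal sketch on the same corpus variable:

from sklearn.feature_extraction.text import TfidfVectorizer

# One-step alternative to CountVectorizer + TfidfTransformer
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)  # corpus = df['分词'] from the cell above
print(tfidf_matrix.shape)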
Text classification: manual labels vs. KMeans
from sklearn.cluster import KMeans

starttime = datetime.datetime.now()
path = '训练集分词结果(随机选取1000个样本).xlsx'
df = pd.read_excel(path, dtype=str)
corpus = df['分词']
kmeans = KMeans(n_clusters=10)  # n_clusters: the number of clusters
kmeans.fit(weight)
res = [list(df['类别']), list(kmeans.labels_)]
df_res = pd.DataFrame(np.array(res).T, columns=['人工分类', 'Kmeans分类'])
path_res = 'Kmeans自动分类结果.xlsx'
df_res.to_excel(path_res, index=False)
df_res
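To quantify how well the KMeans clusters line up with the manual categories, one option (an addition, not part of the original) is a contingency table plus the adjusted Rand index from sklearn.metrics:

from sklearn.metrics import adjusted_rand_score
import pandas as pd

# Cross-tabulate manual categories against cluster ids
print(pd.crosstab(df_res['人工分类'], df_res['Kmeans分类']))

# Adjusted Rand index: 1.0 means perfect agreement, values near 0 mean chance-level agreement
print(adjusted_rand_score(df_res['人工分类'], df_res['Kmeans分类']))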