-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathSpark_danmu_analyze.py
216 lines (200 loc) · 9.26 KB
/
Spark_danmu_analyze.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
# -*- coding: utf-8 -*-
"""
Douyu live-stream danmu (bullet-comment) analysis with PySpark.

Reads the danmu log captured for a given room id on today's date,
computes summary statistics and plots, and appends the text results
to a per-run results file.

Created on Mon Jan 8 21:18:23 2018
@author: KAI
"""
#%%============ Setup ============#
from pyspark import SparkConf
from pyspark import SparkContext
import matplotlib.pyplot as plt
import matplotlib.dates as mdate
import pandas as pd
import matplotlib as mlp
import numpy as np
import time
# Use a Chinese-capable font (SimHei) so axis labels render correctly.
mlp.rcParams['font.family']='sans-serif'
mlp.rcParams['font.sans-serif']=[u'SimHei']
mlp.rcParams['axes.labelsize']=20
mlp.rcParams['xtick.labelsize']=20
mlp.rcParams['ytick.labelsize']=20
mlp.rcParams['figure.figsize']=(8,6)
rid=input('输入房间号:')  # prompt: enter the room id
# Danmu capture file is named "<room>_<dd_mm_YYYY>.txt" (today's date).
txt_name=str(rid)+'_'+str(time.strftime("%d_%m_%Y"))+'.txt'
#txt_name=str(rid)+'_04_02_2018.txt'
#txt = open(txt_name+'.txt','r',encoding='gbk')
conf = SparkConf().setMaster("local[*]").setAppName("DouyuApp")
sc = SparkContext(conf=conf)  # local master URL / app name
# NOTE(review): the commented-out open(..., encoding='gbk') above suggests the
# capture may be GBK-encoded, while sc.textFile decodes UTF-8 — confirm encoding.
danmulist=sc.textFile(txt_name)
# Each record is '->'-separated. Fields used later in this script:
# [0]=user id, [1]=nickname, [2]=level, [3]=message text,
# [-2]=badge name ('NONE' if absent), [-1]=badge level — TODO confirm schema.
danmu_data=danmulist.map(lambda lines:lines.strip('\n').split('->'))
danmu_data.cache()
# Open the results file in append mode; all summary stats are printed into it.
result=open('Result_'+txt_name+'.txt','a+')
#%%========= Total danmu count ===========#
Total_danmu_num = danmu_data.count()
print("弹幕总数:",Total_danmu_num,file=result)
#%%========= Total audience count ===========#
# One record per distinct user id = number of unique viewers who chatted.
unique_viewers = danmu_data.map(lambda record: record[0]).distinct()
Total_pop = unique_viewers.count()
print("观众老爷总数:",Total_pop,file=result)  # counted by user id
#%%========== Danmu colors ============#
# NOTE(review): disabled color-distribution analysis. It depends on a helper
# module (translate_to_color) that is not imported here. Kept for reference.
'''
colors=danmu_data.map(lambda x:(int(x[-2]),1)).reduceByKey(lambda x,y:x+y).collect()
colors=np.array(colors)
#print(colors)
white_num=colors[0,1]
other_num=sum(colors[1:,1])
print('白色弹幕:',white_num,file=result)
print('有色弹幕: ',other_num,file=result)
for i in colors[1:]:
print("%s,有 %d 条,占有色弹幕的 %.1f %%"%(translate_to_color.ttc(i[0]),i[1],i[1]/other_num*100),file=result)
'''
#%%============ Chatterbox leaderboard ===============#
NUM=5  # leaderboard size (top NUM entries)
# Key on the (id, nickname) pair as a single unit, count messages per pair,
# then take the NUM most talkative viewers.
Tops = (danmu_data
        .map(lambda rec: ((int(rec[0]), rec[1]), 1))
        .reduceByKey(lambda a, b: a + b)
        .sortBy(lambda kv: kv[1], ascending=False)
        .take(NUM))
print('话痨榜: ',Tops,file=result)
#%%=========== Audience level distribution ============#
# Per user id: summarize the user's level as the rounded-up median of all
# levels observed for that id (a viewer's level can change mid-stream).
levels=danmu_data.map(lambda x:(int(x[0]),int(x[2]))).groupByKey().map(lambda x:(x[0],np.ceil(np.median(list(x[1])))))
# Extract the level column as floats (consistent with the badge-level section).
levels=np.array(levels.collect())[:,1].astype(float)
print("观众等级中值:%d"%(np.median(levels),),file=result)
# Bin width 2. Bug fix: np.arange excludes its stop value, so the original
# arange(min, max, 2) dropped the highest-level viewers from the histogram;
# extend the upper edge one bin past the max.
lbins=np.arange(min(levels),max(levels)+2,2)
plt.figure(1)
plt.hist(levels,lbins,histtype='bar',facecolor='orange',edgecolor='black',alpha=0.75,rwidth=0.8)
plt.xlabel("等级区间")
plt.xlim(min(levels),max(levels))
plt.ylabel("出现频率")
#plt.title("观众等级分布")
plt.savefig('%s Audience level.png'%(txt_name,),dpi=300)
#%%========== Top-5 badge share ======#
# Keep only messages from viewers wearing a fan badge.
Badges=danmu_data.filter(lambda x:x[-2]!='NONE')
# Count distinct (id, badge) pairs so each viewer contributes once per badge.
Top_badges_list=Badges.map(lambda x:(int(x[0]),x[-2])).distinct().groupBy(lambda x:x[1]).map(lambda x:(x[0],len(x[1]))).sortBy(lambda x:x[1],ascending=False).take(NUM)
print(Top_badges_list[0][0]+"的牌子数共",Top_badges_list[0][1],file=result)
# Pie chart of the top-NUM badges' share.
Top_badges=np.array(Top_badges_list)
labels=Top_badges[:,0]
# Bug fix: np.int was removed in NumPy 1.24 — use the builtin int dtype.
badge_counts=Top_badges[:,1].astype(int)
sizes=badge_counts/badge_counts.sum()
colors='lightcoral','gold','lightskyblue','yellow','yellowgreen'
explode=0.5,0.4,0.2,0.2,0
plt.figure(2)
_,t_text,p_text=plt.pie(tuple(sizes),labels=tuple(labels), explode=explode,colors=colors, autopct='%1.1f%%',
        shadow=True, startangle=50)
for p,t in zip(p_text,t_text):
    # Bug fix: set_size is a method; the original 't.set_size=(40)' merely
    # bound a tuple to the attribute name and never changed the font size.
    t.set_size(40)
    p.set_size(40)
plt.axis('equal')
#plt.title("观众前五牌子比例")
plt.savefig('%s TOP5 badge.png'%(txt_name,),dpi=300)
#%%========= Badge level distribution ==========#
# Restrict to the single most popular badge (first entry of Top_badges_list).
badge_levels_raw=Badges.filter(lambda x:x[-2]=='%s'%(Top_badges_list[0][0],))
# Per user: rounded-up median badge level across that user's messages.
badge_levels=badge_levels_raw.map(lambda x:(x[0],int(x[-1]))).groupByKey().map(lambda x:(x[0],np.ceil(np.median(list(x[1])))))
blevels=np.array(badge_levels.collect())[:,1]
# Bug fix: np.float was removed in NumPy 1.24 — use the builtin float.
blevels=blevels.astype(float)
print("牌子的平均等级:%1.1f"%(np.mean(blevels),),file=result)
# Bug fix: np.arange excludes its stop value, so arange(min, max, 1) dropped
# the highest badge level from the histogram; extend the upper edge by one.
bbins=np.arange(min(blevels),max(blevels)+1,1)
plt.figure(3)
plt.hist(blevels,bbins,histtype='bar',facecolor='darkblue',edgecolor='black',alpha=0.75,rwidth=0.8)
plt.xlabel("等级区间")
plt.ylabel("出现频率")
#plt.title("%s牌子等级分布"%(labels[0],))
plt.savefig('%s badge level.png'%(txt_name,),dpi=300)
#%%========= Danmu hot words ============#
import jieba.analyse as ana  # jieba: Chinese word segmentation
import jieba.posseg as psg
# Pull every message body (field 3) to the driver and join into one text blob.
danmu_content=danmu_data.map(lambda x:x[3]).collect()
content=" ".join(danmu_content)
# Part-of-speech tag each token so uninteresting word classes can be filtered.
danmu_words_flags=[(token.word,token.flag) for token in psg.cut(content)]
stop_attr = ['a','b','c','d','f','df','p','r','rr','s','t','u','ule','ude1','v','z','x','y','e']
stop_word = ['了','的','吧','吗','个','人','部','1','2','3','4','一']
# Keep a word only if neither its POS tag nor its surface form is stop-listed.
Topwords = [word for word, flag in danmu_words_flags
            if flag not in stop_attr and word not in stop_word]
from collections import Counter
c = Counter(Topwords).most_common(50)
# Render the hot words as a word cloud.
from wordcloud import WordCloud,STOPWORDS,ImageColorGenerator
# NOTE(review): disabled variant that masks the cloud with a cover image
# ('cover_chicken.jpg') and recolors words from it. Kept for reference.
'''
text=dict(c)
backgroud_Image = plt.imread('cover_chicken.jpg')
#backgroud_Image = plt.imread('cover_love.jpg')
wc = WordCloud( background_color = 'white', # 设置背景颜色
mask = backgroud_Image, # 设置背景图片
max_words = 200, # 设置最大现实的字数
stopwords = STOPWORDS, # 设置停用词
font_path = './fonts/simhei.ttf',# 设置字体格式,如不设置显示不了中文
max_font_size = 30,# 设置字体最大值
width=1000,
height=860,
#min_font_size = 10,
random_state = 24, # 设置有多少种随机生成状态,即有多少种配色方案
)
wc.generate_from_frequencies(text)
image_colors = ImageColorGenerator(backgroud_Image)
wc.recolor(color_func = image_colors)
'''
# Build the word cloud from the top-50 word frequencies.
text=dict(c)
wc = WordCloud(background_color="lightyellow",width=1200, height=860, margin=2,font_path = './fonts/simhei.ttf',).generate_from_frequencies(text)
# Bug fix: open a fresh figure before imshow. Without this, the word cloud
# is drawn onto figure 3 (the badge-level histogram), which is still the
# current figure at this point.
plt.figure()
plt.imshow(wc)
plt.axis('off')
plt.savefig('%s hot_words.png'%(txt_name,),dpi=600)
print('今日热词: ',c,file=result)
# Keyword extraction via TextRank (a PageRank-like graph ranking);
# allowPOS restricts candidates to the listed part-of-speech tags.
Keywords=ana.textrank(content, topK = 20, withWeight = False, allowPOS = ('ns', 'n', 'v', 'nv'))
print('今日弹幕关键词: ',Keywords,file=result)
#%%============ VIP (noble) danmu ===========#
# NOTE(review): disabled VIP analysis; it relies on the last field equalling
# the literal "b'1'" to mark noble messages. Kept for reference.
'''
vip=danmu_data.filter(lambda x:x[-1]=='b\'1\'')
#Tops=danmu_data.map(lambda x:((int(x[0]),x[1]),1)).reduceByKey(lambda x,y:x+y).sortBy(lambda x:x[1],ascending=False).take(NUM)
vip_content=vip.map(lambda x:(x[1],1)).reduceByKey(lambda x,y:x+y).sortBy(lambda x:x[1],ascending=False).take(NUM)
#print(vip.collect())
print("贵族弹幕有 ",vip.count(),file=result)
print("VIP节奏位 ",vip_content,file=result)
'''
#%%========= Follower-count and popularity ("hot") trends ===========#
# Room log "<room>_<dd_mm_YYYY>room.txt": '|'-separated lines where
# field 0 is the hot (popularity) value and field 1 is the follower count.
txt_name_room=str(rid)+'_'+str(time.strftime("%d_%m_%Y"))+'room.txt'
#txt_name_room=str(rid)+'_04_02_2018room.txt'
roominformation=sc.textFile(txt_name_room)
roominfo=roominformation.map(lambda lines:lines.strip('\n').split('|'))
# Bug fix: np.int was removed in NumPy 1.24 — use the builtin int dtype.
roominfo=np.array(roominfo.collect())[:,0:2].astype(int)
hot=roominfo[:,0]
fans=roominfo[:,1]
print('今日关注量增长约: ',fans[-1]-fans[0],file=result)
print('今日热度峰值: ',np.max(hot),file=result)
print('今日热度中值: ',round(np.median(hot)),file=result)
# Dual-axis line plot: followers on the left axis, hot value on the right.
fig,ax1 = plt.subplots()
ax2 = ax1.twinx()
#ax1.xaxis.set_major_formatter(mdate.DateFormatter('%Y-%m-%d %H:%M:%S'))  # timestamp tick format
#plt.xticks(pd.date_range('2018-02-04 09:00:44','2018-02-04 15:54:44',freq='5min'))  # tick interval
plt.xticks(rotation=30)
ax1.plot(fans, 'ro-',label='关注量',linewidth=3.0)
ax2.plot(hot, 'm^-',label='热度',linewidth=3.0)
ax1.set_xlabel('时间序列')
ax1.set_ylabel('关注量')
ax2.set_ylabel('热度')
#ax1.legend(loc=2,prop={'size': 6})
#ax2.legend(loc=1,prop={'size': 6})
#plt.title("主播热度和关注量的变化")
plt.savefig('%s hot and fans.png'%(txt_name,),dpi=300)
#%%======== Gifts =============#
# Gift log "<room>_<dd_mm_YYYY>gift.txt", '->'-separated records;
# field 0 = sender id, field 1 = gift name (per usage below) — TODO confirm.
txt_name_gift=str(rid)+'_'+str(time.strftime("%d_%m_%Y"))+'gift.txt'
#txt_name_gift=str(rid)+'_04_02_2018gift.txt'
giftlist=sc.textFile(txt_name_gift)
gift_info=giftlist.map(lambda line:line.strip('\n').split('->'))
# Total count per gift type.
gift=gift_info.groupBy(lambda rec:rec[1]).map(lambda grp:(grp[0],len(grp[1])))
print("礼物统计: ",gift.collect(),file=result)
# Top-NUM (sender, gift) pairs by how many times each was sent.
gift_vip=(gift_info
          .map(lambda rec:((rec[0],rec[1]),1))
          .reduceByKey(lambda a,b:a+b)
          .sortBy(lambda kv:kv[1],ascending=False)
          .take(NUM))
print("礼物榜: ",gift_vip,file=result)
#%%========== Wrap-up ===========#
# Flush and close the results file, then display all generated figures.
result.close()
plt.show()