#-*- coding: UTF-8 -*-
__author__ = 'Administrator'
import pymysql
import math
import pandas
import jieba
import jieba.analyse
from snownlp import SnowNLP
import nltk
from nltk.classify import NaiveBayesClassifier
from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords
class dboUtils(object):
    """Thin wrapper around a single pymysql connection.

    NOTE(review): host/user/password are hard-coded; move them to
    configuration before shipping.
    """

    def __init__(self):
        # charset="utf8" so the Chinese text columns round-trip correctly.
        self.conn = pymysql.connect(host="192.168.1.1", port=3306, user="root",
                                    password="123", db="stet", charset="utf8")

    def query(self, sql=''):
        """Execute *sql* and return the cursor holding its result set.

        Bug fix: the original returned ``self.cursor`` without ever
        creating a cursor or executing the statement, which would raise
        AttributeError on first use.
        """
        cursor = self.conn.cursor()
        cursor.execute(sql)
        return cursor
def main1():
    """Print a SnowNLP sentiment score for every non-empty topic body.

    Returns an empty list: the original kept the ``data.append`` call
    commented out, so no records are ever collected.
    """
    rows = dbo.query("SELECT id,title,content FROM topic").fetchall()
    data = []
    for row in rows:
        topic_id = row[0]
        content = row[2]
        if content == '':
            continue
        sentiment = SnowNLP(content).sentiments
        record = {'id': topic_id, 'title': content, 'score': sentiment}
        print(content + " : " + str(sentiment))
        # record deliberately not appended, mirroring the original's
        # commented-out `data.append(...)`.
    return data
def getKeywordIds(text, verb1, verb2, word_lst=None):
    """Map the words of *text* to keyword ids from two vocabularies.

    For each word: exact lookup in the vocabulary first; when the word is
    absent, every vocabulary key that occurs as a substring of *text*
    contributes its id instead. Duplicate ids are collapsed with a set.

    Args:
        text: the raw comment/topic text.
        verb1: dict mapping keyword -> id (sentiment keywords).
        verb2: dict mapping keyword -> id (problem keywords).
        word_lst: pre-tokenized words of *text*. Defaults to None, in which
            case *text* is tokenized here. (Bug fix: the original read a
            ``word_lst`` name that is not defined in this function and
            would raise NameError; it leaked from the caller only because
            the script's indentation was flattened.)

    Returns:
        A two-element list: [comma-joined verb1 ids, comma-joined verb2 ids].
        Ordering within each joined string is unspecified (set iteration).
    """
    if word_lst is None:
        word_lst = WordPunctTokenizer().tokenize(text)
    keywordids = []
    problem = []
    # Same matching procedure for both vocabularies.
    for vocab, bucket in ((verb1, keywordids), (verb2, problem)):
        for word in word_lst:
            idx = vocab.get(word)
            if idx is not None:
                bucket.append(str(idx))
            else:
                # Fall back to substring matching over all vocabulary keys.
                for key in vocab:
                    if text.find(key) >= 0:
                        bucket.append(str(vocab.get(key)))
    return [",".join(list(set(keywordids))), ",".join(list(set(problem)))]
def main2():
    """Classify comment sentiment and attach matching keyword/problem ids.

    Pipeline: load the keyword table, train an NLTK Naive Bayes classifier
    on the positive/negative/neutral vocabularies, then score every comment
    with SnowNLP and map it onto keyword ids via getKeywordIds.

    Returns a list of dicts: {'id', 'score', 'keywordids', 'problem'} where
    score is 1 (positive), -1, or 0.
    """
    keyword_rows = dbo.query("SELECT id,name,type FROM keyword WHERE appid = 1").fetchall()
    # Vocabulary preparation: keyword name -> keyword id, bucketed by type.
    positive_vocab = {}
    negative_vocab = {}
    neutral_vocab = {}
    problom_vocab = {}
    for row in keyword_rows:
        if row[2] == 1:
            positive_vocab[row[1]] = row[0]
        elif row[2] == 2:
            problom_vocab[row[1]] = row[0]
        elif row[2] == 3:
            # Bug fix: the original line was `elif idx[2]` with no condition
            # or colon (SyntaxError). Type 3 = negative inferred from the
            # 1/2/3/else pattern — TODO confirm against the keyword table.
            negative_vocab[row[1]] = row[0]
        else:
            neutral_vocab[row[1]] = row[0]
    # Feature extraction for the Naive Bayes training set.
    positive_features = [(word_feats(pos), 'pos') for pos in positive_vocab.keys()]
    negative_features = [(word_feats(neg), 'neg') for neg in negative_vocab.keys()]
    neutral_features = [(word_feats(neu), 'neu') for neu in neutral_vocab.keys()]
    train_set = negative_features + positive_features + neutral_features
    classifier = NaiveBayesClassifier.train(train_set)
    # Hoisted out of the comment loop: stopword list is loop-invariant.
    # NOTE(review): 'stop_word' is not a standard NLTK corpus name —
    # presumably a locally installed custom corpus; verify.
    stop_words = set(stopwords.words('stop_word'))
    comment_rows = dbo.query("SELECT id,content FROM comments").fetchall()
    data = []
    for row in comment_rows:
        comment_id = row[0]
        text = row[1]
        if text == '':
            continue
        word_lst = [w for w in WordPunctTokenizer().tokenize(text)
                    if w not in stop_words]
        # pos/neg word counts from the trained classifier. Currently only
        # computed for the (commented-out) diagnostics below — the final
        # score comes from SnowNLP alone.
        neg = 0
        pos = 0
        for word in word_lst:
            label = classifier.classify(word_feats(word))
            if label == 'neg':
                neg += 1
            if label == 'pos':
                pos += 1
        score = round(SnowNLP(text).sentiments, 2)
        # NOTE(review): the (0, 0.5] band maps to -1 while <= 0 maps to 0;
        # SnowNLP sentiments are in [0, 1], so the 0 branch is near-dead.
        # Preserved as-is — confirm intended thresholds.
        if score > 0.5:
            score = 1
            keyids = getKeywordIds(text, positive_vocab, problom_vocab)
        elif score > 0:
            score = -1
            keyids = getKeywordIds(text, negative_vocab, problom_vocab)
        else:
            score = 0
            keyids = getKeywordIds(text, negative_vocab, problom_vocab)
        data.append({
            'id': comment_id,
            'score': score,
            'keywordids': keyids[0],
            'problem': keyids[1],
        })
    return data
def word_feats(words):
    """Build an NLTK feature dict: each element of *words* maps to True."""
    return {token: True for token in words}
def sortProblem():
    """Print a SnowNLP sentiment score for every problem-type keyword name.

    Returns an empty list (the append is intentionally disabled, matching
    the original's commented-out `data.append`).
    """
    rows = dbo.query("SELECT id,name FROM keyword WHERE type = 2").fetchall()
    data = []
    for row in rows:
        keyword_id = row[0]
        # Bug fix: the query selects two columns (id, name) but the
        # original read row[2], which raises IndexError on every row.
        name = row[1]
        if name == '':
            continue
        sentiment = SnowNLP(name).sentiments
        record = {'id': keyword_id, 'title': name, 'score': sentiment}
        print(name + " : " + str(sentiment))
        # record deliberately not appended (original kept this disabled).
    return data
if __name__ == '__main__':
    # `global` at module level is a no-op — the original `global dbo`
    # statement was removed. `dbo` is a module-level name either way,
    # which is how main1/main2/sortProblem reach it.
    dbo = dboUtils()
    res = main2()
    # Load results into pandas and show the last 20 rows.
    # (Earlier experiments — main1(), sortProblem(), groupby aggregations —
    # were commented out in the original and have been dropped here.)
    df = pandas.DataFrame(res)
    res = df.tail(20)
    print(res)
# NOTE(review): removed two lines of scraped webpage footer text
# ("领取专属 10元无门槛券" / "私享最新 技术干货") — promotional boilerplate,
# not Python; it made the file fail to parse.