Spam Filtering with Bayesian Statistics
This is a quick placeholder post. Bayesian spam filtering is a simple method, it works noticeably well, and it covers most everyday needs; I won't go into the algorithm itself here, it is easy to look up.
In real use, though, its accuracy depends heavily on manual tuning: careful adjustment is needed to really drive the false-positive rate down.
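One concrete form of that tuning is simply choosing the score cutoff: score a small hand-labeled validation set and take the lowest threshold that keeps false positives on normal mail under a target rate. A minimal sketch, where pick_threshold, the (text, is_spam) validation format, and the 1% target are assumptions for illustration (scorer would be something like the cal method in the code below):

#coding:utf-8
# Sketch only: tune the spam cutoff against a hand-labeled validation set.
# "scorer" is any function that returns a spam probability for a piece of text.
def pick_threshold(scorer, validation, max_fp_rate=0.01):
    ham = [text for text, is_spam in validation if not is_spam]
    for t in [i / 100.0 for i in range(50, 100)]:
        false_positives = sum(1 for text in ham if scorer(text) >= t)
        if ham and float(false_positives) / len(ham) <= max_fp_rate:
            return t  # lowest cutoff that keeps ham misclassification under the target
    return 0.99  # nothing met the target; fall back to a very conservative cutoff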
Below is the code, copied from somewhere else and tweaked a bit.
#!/usr/bin/env python
#coding:utf-8
import json
import mmseg


class Bayes(object):
    def __init__(self):
        # how many spam / healthy messages have been learned so far
        self.spam_words_count = 0
        self.healthy_words_count = 0
        # word -> "spam_count:healthy_count"
        self.words_value = {}

    def learn(self, sentence, is_spam):
        words = list(mmseg.seg_txt(sentence))
        print json.dumps(words, encoding="UTF-8", ensure_ascii=False)  # debug: show the segmentation
        for flag, word in enumerate(words):
            #print str(flag) + " -- " + word
            pword = False
            if word in self.words_value:
                pword = self.words_value[word]
            if is_spam:
                if not pword:
                    self.words_value[word] = '1:0'
                else:
                    pword = pword.split(':')
                    cs = pword[0]
                    ch = pword[1]
                    self.words_value[word] = str(int(cs) + 1) + ':' + ch
            else:
                if not pword:
                    self.words_value[word] = '0:1'
                else:
                    pword = pword.split(':')
                    cs = pword[0]
                    ch = pword[1]
                    self.words_value[word] = cs + ':' + str(int(ch) + 1)
        # one more message learned, whatever its length
        if is_spam:
            self.spam_words_count = self.spam_words_count + 1
        else:
            self.healthy_words_count = self.healthy_words_count + 1

    def cal(self, sentence):
        total_s = int(self.spam_words_count)
        total_h = int(self.healthy_words_count)
        ps = ph = 0.5  # prior probability that the message is spam, before looking at any words
        words_percent = []
        words = set(mmseg.seg_txt(sentence))
        pws = None
        pwh = None
        for flag, word in enumerate(words):
            # each word counts once per message (hence the set above), not once per occurrence
            value = False
            if word in self.words_value:
                value = self.words_value[word]
            if value:
                tvalue = value.split(':')
                cs = int(tvalue[0])
                ch = int(tvalue[1])
                pws = float(cs) / total_s  # roughly P(word | spam): occurrences over spam messages learned
                pwh = float(ch) / total_h  # roughly P(word | healthy)
            else:
                pws = pwh = 0.01  # unseen word: small neutral value
            if pws == 0:
                pws = 0.01
            if pwh == 0:
                pwh = 0.01
            # P(spam | word) by Bayes' rule with the 0.5 priors
            psw = ps * pws / (ps * pws + pwh * ph)
            words_percent.append(psw)
        # combine the per-word probabilities under the naive independence assumption
        j = 1
        for i in words_percent:
            j = j * i
        x = 1
        for i in words_percent:
            x = (1 - i) * x
        judge_value = j / (j + x)
        return judge_value

    def learnFile(self, path, isspam):
        olines = []
        ofp = open(path, "r")
        try:
            olines = set(ofp.readlines())
        except Exception as e:
            print "read file error!", "file: " + path
            print e
        finally:
            ofp.close()
        for l in olines:
            self.learn(l, isspam)


bayes = Bayes()


def main():
    test()


def test():
    testwords = ['想买魔声耳机的加微信gcy_809718332', '不错,好听', '微信不错']
    goodfile = "good.txt"
    badfile = "bad.txt"
    bayes.learnFile(goodfile, False)
    bayes.learnFile(badfile, True)
    for w in testwords:
        pct = bayes.cal(w)
        print w + " -- " + str(pct)


if __name__ == "__main__":
    main()
A few issues worth paying attention to in real use:
1. Selection of the spam samples: enough spam has to be picked by hand for training, and the selection should be kept as objective as possible;
2. Selection of the normal-mail samples: these also need to be chosen objectively.
The two points above are the obvious part. What is actually worth noting is that most spam is likely to contain words such as "QQ" or "微信", which can cause a normal mail that merely mentions QQ to be marked as spam. One fix is to manually add enough normal-mail samples containing the QQ keyword, which indirectly adjusts the probability attached to QQ.
Alternatively, a whitelist-style filtering strategy can be applied, for example removing particles and other neutral words before counting, as sketched below.
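As a rough illustration of that filtering idea, where STOP_WORDS and filter_tokens are made-up names and the word list is only an example, neutral tokens can simply be dropped before they ever reach learn or cal:

#coding:utf-8
# Sketch: drop particles / neutral words before training or scoring.
STOP_WORDS = set(["的", "了", "吗", "啊"])  # extend with whatever should never count as evidence

def filter_tokens(words, stop_words=STOP_WORDS):
    return [w for w in words if w not in stop_words]

# usage idea: words = filter_tokens(list(mmseg.seg_txt(sentence)))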
In addition, accuracy can be pushed further by looking at conditional probabilities between words rather than single words alone: "代理" appearing right after "广告", for example, is a much stronger spam signal than either word by itself. I may try building that some day; a rough sketch of the idea follows.
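A minimal sketch of that word-pair idea, assuming adjacent pairs from the same mmseg segmentation are simply emitted as extra features and fed through the same counting logic (pair_features is a hypothetical helper, not part of the class above):

#coding:utf-8
import mmseg

# Sketch: emit adjacent word pairs as extra features, so that e.g. "广告"
# immediately followed by "代理" gets its own spam/healthy count.
def pair_features(sentence):
    words = list(mmseg.seg_txt(sentence))
    pairs = [words[i] + "|" + words[i + 1] for i in range(len(words) - 1)]
    return words + pairs

Feeding pair_features(sentence) through the same learn/cal counting means a pair that only ever shows up in spam pushes the score toward spam even when each word on its own is ambiguous.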

The key pieces are a random fingerprint generator and a mapper. The idea is clear enough, though I haven't actually used it. @kulv
@你哥
Good, I'll go take a look. This is the one used for hash-hit membership checks, right?
Bloom filter, see 《数学之美》 (The Beauty of Mathematics).
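For context on the Bloom filter mentioned in these comments, a minimal sketch of the hash-hit idea: k hash values per item act as its fingerprint, set bits in a bit array, and membership is a check that all those bits are set. The bit-array size and the md5-based hashing here are arbitrary illustrative choices.

#coding:utf-8
import hashlib

class BloomFilter(object):
    def __init__(self, m=1 << 20, k=4):
        self.m = m                 # number of bits in the filter
        self.k = k                 # number of hash functions (fingerprint size)
        self.bits = bytearray(m // 8)

    def _positions(self, item):
        # item is expected to be a byte string here
        for i in range(self.k):
            digest = hashlib.md5(str(i) + item).hexdigest()
            yield int(digest, 16) % self.m

    def add(self, item):
        for p in self._positions(item):
            self.bits[p // 8] |= 1 << (p % 8)

    def __contains__(self, item):
        # may report false positives, never false negatives
        return all(self.bits[p // 8] & (1 << (p % 8)) for p in self._positions(item))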