博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
bayes
阅读量:4955 次
发布时间:2019-06-12

本文共 5743 字,大约阅读时间需要 19 分钟。

from numpy import *import timestarttime = time.time()def loadDataSet():     postingList = [['my', 'dog', 'has', 'flea',                    'problems', 'help', 'please'],                    ['maybe', 'not', 'take', 'him',                    'to', 'dog', 'park', 'stupid'],                    ['my', 'dalmation', 'is', 'so', 'cute',                    'I', 'love', 'him'],                    ['stop', 'posting', 'stupid', 'worthless',                     'garbage'],                    ['mr', 'licks', 'ate', 'my', 'steak', 'how',                    'to', 'stop', 'him'],                    ['quit', 'buying', 'worthless', 'dog', 'food',                    'stupid']]    classVec = [0, 1, 0, 1, 0, 1]     return postingList, classVecdef createVocabList(dataSet): # dataSet = postingList     vocabSet = set([]) # vocabSet = set(dataSet)    for document in dataSet:        vocabSet = vocabSet | set(document) #     return list(vocabSet) # createVocabList = list(set(dataSet)) def setOfWords2Vec(vocabList, inputSet):     returnVec = [0] * len(vocabList) # [0, 0 , 0 ,0,..] len(vocabList)  0    for word in vocabList:        if word in inputSet:            returnVec[vocabList.index(word)] = 1 + 1.0        else:            returnVec[vocabList.index(word)] = 1.0            print "the word: %s is not in my Vocabulary!" % word    return returnVec def txt2trainxy(filename1, filename2):    import re    reg = re.compile(r'\W*') #    # step 1: loading data...    print "stet 1: loading data..."    from os import listdir    ld1 = listdir('email/' + filename1); ld2 = listdir('email/' + filename2)    filelist = ld1 + ld2    trainy = ((filename1 + '\t') * len(ld1) + (filename2 + '\t') * len(ld2)).split()        trainx = []; fulltext = []; i = 0    for File in filelist:        if i < len(ld1):            fr = reg.split(open('email/' + filename1 + '/' + File).readlines()[0].lower())        else:            fr = reg.split(open('email/' + filename2 + '/' + File).readlines()[0].lower())        trainx.append([f for f in fr if len(f) > 2]) #        fulltext.extend([f for f in fr if len(f) > 2]) #        i += 1    fulltext = list(set(fulltext))    # set of words    trainxws = [[list(set(item)).count(strg) + 1.0 for strg in fulltext] for item in trainx]    # bag of words     trainxwb = [[item.count(strg) + 1.0 for strg in fulltext] for item in trainx]    return trainxws, trainxwb, trainy, trainx, fulltextdef testx2vec(testx, fulltext):    # set of words    testxws = [list(set(testx)).count(strg) + 1.0 for strg in fulltext] #    # bag of words     testxwb = [testx.count(strg) + 1.0 for strg in fulltext] #    for word in testx:        if word not in fulltext:            print "the word: %s is not in my fulltext!" % word    return testxws, testxwbdef bayes(testx, trainx, trainy, fulltext):    print "---Getting Prob..."    s = set(trainy); l = len(trainy); r = len(trainx[0])    IDs = [[id for id in range(l) if trainy[id] == item] for item in s]    logproby = [log(array(trainy.count(item)) / float(l)) for item in s]    numbxv = [sum([trainx[id] for id in ids], 0) for ids in IDs]    numbx = [sum([trainx[id] for id in ids]) + 2.0 for ids in IDs] #    probx = [numbxv[i] / float(numbx[i]) for i in range(len(s))]    logprobx = [[log(p[i]) for i in range(r)] for p in probx]    print "---Printing Prob..."    #print probx    print [fulltext[i] for i in (-array(probx)).argsort()[:,: 5][0]] # argsort() small to big    print trainy[IDs[0][0]]    print [fulltext[i] for i in (-array(probx)).argsort()[:,: 5][1]]    print trainy[IDs[1][0]]    """    print IDs    print numbxv    print logprobx    """    # step 4: showing the result...    print "---Showing the result..."    # set of words    sumlogpxws = sum(array(logprobx) * testx, 1)    sumlogpxyws = array(sumlogpxws) + array(logproby)    #print logprobx    print sumlogpxws    print sum(array(probx) * testx, 1)    bestyws = trainy[IDs[sumlogpxyws.argmax()][0]]    print "---From set of words: ", bestyws    """    # bag of words    sumlogpxwb = sum(array(logprobx) * testxwb, 1)    sumlogpxywb = array(sumlogpxwb) + array(logproby)    bestywb = trainy[IDs[sumlogpxywb.argmax()][0]]    print "---From bag of words: ", bestywb    """    return bestyws    def main():    # step 1: loading data...    trainxws, trainxwb, trainy, trainx, fulltext = txt2trainxy('spam','ham')    print fulltext    # step 2: training...    print "step 2: training..."    pass    # step 3: testing...    print "step 3: testing..."    print "---Preparing testdata..."    import random    l = len(trainy)    testid = random.sample(range(l), 20)    testxxx = [trainxws[i] for i in testid]    testyyy = [trainy[i] for i in testid]    testtrainxws = [trainxws[i] for i in range(l) if i not in testid]    testtrainy = [trainy[i] for i in range(l) if i not in testid]    print "---Testing now..."    errorcount = 0; p = len(testid)    for i in range(p):        if bayes(testxxx[i], testtrainxws, testtrainy, fulltext) != testyyy[i]:            errorcount += 1    print errorcount    print p    print "---Errorrate is: ", (errorcount / float(p))    # step 4: showing the result    print "step 4: using..."    testx = ['love', 'my', 'dalmation']    print "the testx is: ", testx    print "---Changing testx into vector..."    testxws, testxwb = testx2vec(testx, fulltext)    #print testxws    bayes(testxws, testtrainxws, testtrainy, fulltext)main()"""trainx, trainy = loadDataSet()fulltext = createVocabList(trainx)print fulltextprint setOfWords2Vec(fulltext, trainx[0])trainxws = []for t in trainx:    trainxws.append(setOfWords2Vec(fulltext, t))testEntry1 = ['love', 'my', 'dalmation']testEntry2 = ['stupid', 'garbage']bayes(testEntry1, trainxws, trainy, fulltext)"""

 

转载于:https://www.cnblogs.com/monne/p/4249324.html

你可能感兴趣的文章
javascript运算符的优先级
查看>>
React + Redux 入门(一):抛开 React 学 Redux
查看>>
13位时间戳和时间格式化转换,工具类
查看>>
vue router-link子级返回父级页面
查看>>
C# 通知机制 IObserver<T> 和 IObservable<T>
查看>>
Code of Conduct by jsFoundation
查看>>
div 只显示两行超出部分隐藏
查看>>
C#小练习ⅲ
查看>>
debounce、throttle、requestAnimationFrame
查看>>
linux下的C语言快速学习—进程和文件
查看>>
电源防反接保护电路
查看>>
stm32 堆和栈(stm32 Heap & Stack)
查看>>
SpringMVC从入门到精通之第三章
查看>>
arraylist
查看>>
zoj 1649 Rescue (BFS)(转载)
查看>>
2124: 等差子序列 - BZOJ
查看>>
字符串匹配算法综述
查看>>
Linux centosVMware shell 管道符和作业控制、shell变量、环境变量配置文件
查看>>
【设计模式】工厂模式
查看>>
两个表格中数据不用是一一对应关系--来筛选不同数据,或者相同数据
查看>>