from numpy import *import timestarttime = time.time()def loadDataSet(): postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'], ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'], ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'], ['stop', 'posting', 'stupid', 'worthless', 'garbage'], ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'], ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']] classVec = [0, 1, 0, 1, 0, 1] return postingList, classVecdef createVocabList(dataSet): # dataSet = postingList vocabSet = set([]) # vocabSet = set(dataSet) for document in dataSet: vocabSet = vocabSet | set(document) # return list(vocabSet) # createVocabList = list(set(dataSet)) def setOfWords2Vec(vocabList, inputSet): returnVec = [0] * len(vocabList) # [0, 0 , 0 ,0,..] len(vocabList) 0 for word in vocabList: if word in inputSet: returnVec[vocabList.index(word)] = 1 + 1.0 else: returnVec[vocabList.index(word)] = 1.0 print "the word: %s is not in my Vocabulary!" % word return returnVec def txt2trainxy(filename1, filename2): import re reg = re.compile(r'\W*') # # step 1: loading data... print "stet 1: loading data..." from os import listdir ld1 = listdir('email/' + filename1); ld2 = listdir('email/' + filename2) filelist = ld1 + ld2 trainy = ((filename1 + '\t') * len(ld1) + (filename2 + '\t') * len(ld2)).split() trainx = []; fulltext = []; i = 0 for File in filelist: if i < len(ld1): fr = reg.split(open('email/' + filename1 + '/' + File).readlines()[0].lower()) else: fr = reg.split(open('email/' + filename2 + '/' + File).readlines()[0].lower()) trainx.append([f for f in fr if len(f) > 2]) # fulltext.extend([f for f in fr if len(f) > 2]) # i += 1 fulltext = list(set(fulltext)) # set of words trainxws = [[list(set(item)).count(strg) + 1.0 for strg in fulltext] for item in trainx] # bag of words trainxwb = [[item.count(strg) + 1.0 for strg in fulltext] for item in trainx] return trainxws, trainxwb, trainy, trainx, fulltextdef testx2vec(testx, fulltext): # set of words testxws = [list(set(testx)).count(strg) + 1.0 for strg in fulltext] # # bag of words testxwb = [testx.count(strg) + 1.0 for strg in fulltext] # for word in testx: if word not in fulltext: print "the word: %s is not in my fulltext!" % word return testxws, testxwbdef bayes(testx, trainx, trainy, fulltext): print "---Getting Prob..." s = set(trainy); l = len(trainy); r = len(trainx[0]) IDs = [[id for id in range(l) if trainy[id] == item] for item in s] logproby = [log(array(trainy.count(item)) / float(l)) for item in s] numbxv = [sum([trainx[id] for id in ids], 0) for ids in IDs] numbx = [sum([trainx[id] for id in ids]) + 2.0 for ids in IDs] # probx = [numbxv[i] / float(numbx[i]) for i in range(len(s))] logprobx = [[log(p[i]) for i in range(r)] for p in probx] print "---Printing Prob..." #print probx print [fulltext[i] for i in (-array(probx)).argsort()[:,: 5][0]] # argsort() small to big print trainy[IDs[0][0]] print [fulltext[i] for i in (-array(probx)).argsort()[:,: 5][1]] print trainy[IDs[1][0]] """ print IDs print numbxv print logprobx """ # step 4: showing the result... print "---Showing the result..." # set of words sumlogpxws = sum(array(logprobx) * testx, 1) sumlogpxyws = array(sumlogpxws) + array(logproby) #print logprobx print sumlogpxws print sum(array(probx) * testx, 1) bestyws = trainy[IDs[sumlogpxyws.argmax()][0]] print "---From set of words: ", bestyws """ # bag of words sumlogpxwb = sum(array(logprobx) * testxwb, 1) sumlogpxywb = array(sumlogpxwb) + array(logproby) bestywb = trainy[IDs[sumlogpxywb.argmax()][0]] print "---From bag of words: ", bestywb """ return bestyws def main(): # step 1: loading data... trainxws, trainxwb, trainy, trainx, fulltext = txt2trainxy('spam','ham') print fulltext # step 2: training... print "step 2: training..." pass # step 3: testing... print "step 3: testing..." print "---Preparing testdata..." import random l = len(trainy) testid = random.sample(range(l), 20) testxxx = [trainxws[i] for i in testid] testyyy = [trainy[i] for i in testid] testtrainxws = [trainxws[i] for i in range(l) if i not in testid] testtrainy = [trainy[i] for i in range(l) if i not in testid] print "---Testing now..." errorcount = 0; p = len(testid) for i in range(p): if bayes(testxxx[i], testtrainxws, testtrainy, fulltext) != testyyy[i]: errorcount += 1 print errorcount print p print "---Errorrate is: ", (errorcount / float(p)) # step 4: showing the result print "step 4: using..." testx = ['love', 'my', 'dalmation'] print "the testx is: ", testx print "---Changing testx into vector..." testxws, testxwb = testx2vec(testx, fulltext) #print testxws bayes(testxws, testtrainxws, testtrainy, fulltext)main()"""trainx, trainy = loadDataSet()fulltext = createVocabList(trainx)print fulltextprint setOfWords2Vec(fulltext, trainx[0])trainxws = []for t in trainx: trainxws.append(setOfWords2Vec(fulltext, t))testEntry1 = ['love', 'my', 'dalmation']testEntry2 = ['stupid', 'garbage']bayes(testEntry1, trainxws, trainy, fulltext)"""