
ML - Naïve Bayes

 

The Naïve Bayes model assumes that all features (random variables) are conditionally independent given the class, so the joint probability distribution can be computed by simply multiplying the individual conditional probabilities.
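In symbols: for a class C and features x_1, ..., x_n, conditional independence gives

P(C \mid x_1, \dots, x_n) \propto P(C) \prod_{i=1}^{n} P(x_i \mid C)

and the classifier predicts the class C that maximizes this product.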

 

 

==============

References

https://zh.wikipedia.org/wiki/%E6%9C%B4%E7%B4%A0%E8%B4%9D%E5%8F%B6%E6%96%AF%E5%88%86%E7%B1%BB%E5%99%A8

https://www.youtube.com/watch?v=FsvNJUIMSh8

https://blog.csdn.net/u010918541/article/details/70880607

https://www.cnblogs.com/yuxc/archive/2011/07/10/2102559.html

https://blog.csdn.net/amds123/article/details/70173402

=============

 

# https://blog.csdn.net/u010918541/article/details/70880607

import numpy as np

# Build a vocabulary list containing every distinct word in the dataset
def createVocabList(dataSet):
    vocabSet = set([])                       # create empty set
    for document in dataSet:
        vocabSet = vocabSet | set(document)  # a = t | s is the union of sets t and s
    return list(vocabSet)
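For example, on a tiny input the function collects the union of all words (set ordering is arbitrary, so the order of the returned list may vary between runs):

createVocabList([['a', 'b'], ['b', 'c']])   # -> ['a', 'b', 'c'] in some order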

# Mark which vocabulary words appear in a given sentence (set-of-words model)
def setOfWords2Vec(vocabList, inputSet):     # inputSet => the words of one sentence (8 sentences total)
    returnVec = [0] * len(vocabList)         # 84 zeros, one slot per vocabulary word
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
    return returnVec
'''
Example inputSet:
'My uncle Sammy was an angry man He had printed on his tombstone What are you looking at'

returnVec: among the 84 vocabulary words, entries are 1 for words that appear in this sentence and 0 otherwise:
[0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]
'''

dataset=['There is only one love happiness in this life to love and love be loved',
    'Being deeply loved by love someone gives you strength while loving someone deeply gives you courage',
    'Success is no accident It is hard work perseverance learning studying sacrifice and most of all love of what you are doing or learning to do',
    'Choose a job you love and you will never have to work a day in your life',
    'For every minute you remain angry you give up sixty seconds of peace of mind',
    'When angry count to ten before you speak If very angry count to one hundred',
    'When angry count to four when very angry swear',
    'My uncle Sammy was an angry man He had printed on his tombstone What are you looking at',
    ]
classset=[1,1,1,1,0,0,0,0] # dataset => the first 4 sentences are LOVE (label 1), the last 4 are ANGRY (label 0)
testdata=['I m still pretty self-centered greedy and angry',
   'Love yourself It is important to stay positive because beauty comes from the inside out']

listofTocken = []
testofTocken = []
for i in range(len(dataset)):
    listofTocken.append(dataset[i].lower().split())   # training set: split each sentence into words
for i in range(len(testdata)):
    testofTocken.append(testdata[i].lower().split())  # test set: split each sentence into words
 
# Build the vocabulary list (no duplicate words)
mVocablist = createVocabList(listofTocken)   # 84 distinct words


''' mVocablist
['while', 'printed', 'your', 'only', 'in', 'an', 'is', 'perseverance', 'deeply', 'man',
 'you', 'choose', 'remain', 'happiness', 'was', 'no', 'every', 'there', 'at', 'doing',
 'give', 'four', 'hard', 'love', 'loving', 'tombstone', 'courage', 'someone', 'it',
 'very', 'or', 'never', 'a', 'one', 'studying', 'when', 'he', 'his', 'and', 'being',
 'work', 'life', 'sixty', 'job', 'up', 'looking', 'most', 'before', 'seconds',
 'strength', 'if', 'my', 'this', 'minute', 'are', 'have', 'to', 'all', 'uncle',
 'hundred', 'swear', 'of', 'had', 'speak', 'loved', 'mind', 'day', 'ten', 'angry',
 'sammy', 'be', 'what', 'do', 'accident', 'peace', 'count', 'on', 'sacrifice',
 'for', 'learning', 'by', 'gives', 'will', 'success']
'''


# For each sentence, build a 0/1 vector over the vocabulary (1 = word appears, 0 = it does not)
trainMat = []

for words in listofTocken:
    # Example words:
    # ['my', 'uncle', 'sammy', 'was', 'an', 'angry', 'man', 'he', 'had', 'printed',
    #  'on', 'his', 'tombstone', 'what', 'are', 'you', 'looking', 'at']
    trainMat.append(setOfWords2Vec(mVocablist, words))

# There are 8 sentences in total, drawn from 84 distinct words.
# trainMat has 8 rows, one per sentence; each row has 84 entries,
# and an entry of 1 means that word appears in that sentence.
 
# Build the test matrix testMat the same way
testMat = []
for words in testofTocken:
    testMat.append(setOfWords2Vec(mVocablist, words))  # 1 if the vocabulary word appears, else 0
 
# Number of training documents
numOftrainDocs = len(trainMat)                 # 8 sentences
# Length of each row, i.e. the vocabulary size
numwords = len(trainMat[0])                    # 84

# Prior probability of the love class
pLove = sum(classset) / float(numOftrainDocs)  # 4/8
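With 4 love quotes and 4 angry quotes, pLove = 4/8 = 0.5, so in testNB below the prior terms log(pL) and log(1.0 - pL) are equal and the classification is decided entirely by the per-word likelihood sums.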

# Initialize the word-count vectors for the love and angry classes.
# Starting the counts at 1 (and the denominators at 2.0) is the add-one
# (Laplace) smoothing used here, so no word ever gets a zero probability.
ploveNum = np.ones(numwords)    # 84 ones
pangryNum = np.ones(numwords)   # 84 ones

ploveDenom = 2.0
pangryDenom = 2.0

for i in range(numOftrainDocs):  # 8 sentences
    if classset[i] == 1:
        # This document belongs to the love class (the first 4 sentences)
        ploveNum = ploveNum + trainMat[i]
        # ploveNum is a vector [1,1,...,1]; trainMat[i] is also a vector [0,0,1,...,0], so the result is still a vector
        ploveDenom = ploveDenom + sum(trainMat[i])  # sum of the 1s in trainMat[i]; a float (65 in total)
    else:
        # This document belongs to the angry class (the last 4 sentences)
        pangryNum += trainMat[i]
        # likewise still a vector
        pangryDenom += sum(trainMat[i])  # sum of the 1s in trainMat[i]; a float (52 in total)
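After this loop, each per-class word probability is the smoothed ratio computed below. In formula form (this code's add-one variant; textbook Laplace smoothing would add the vocabulary size, 84 here, to the denominator rather than 2):

P(w_j \mid \mathrm{love}) = \frac{n_{j,\mathrm{love}} + 1}{\sum_k n_{k,\mathrm{love}} + 2}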

ploveVect = np.log(ploveNum / ploveDenom)     # per-word log probabilities for the love class
pangryVect = np.log(pangryNum / pangryDenom)  # per-word log probabilities for the angry class
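Taking logs turns the product of per-word probabilities into a sum and avoids numerical underflow when many small probabilities are multiplied; this is why testNB below scores each class as

\log P(C \mid d) \propto \log P(C) + \sum_j x_j \log P(w_j \mid C)

where x_j is the 0/1 entry of the document vector.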

# print(ploveVect, pangryVect)
# print('overall prior probability of love quotes:', pLove)

def testNB(doc, ploveV, pangryV, pL):   # doc = one row of testMat (2 test sentences); pL = prior probability of love quotes
    plove = sum(doc * ploveV) + np.log(pL)
    pangry = sum(doc * pangryV) + np.log(1.0 - pL)
    if plove > pangry:
        return "love quote"
    else:
        return "angry quote"
   
   
for i in range(len(testMat)):
    print(testdata[i], 'is @@@', testNB(testMat[i], ploveVect, pangryVect, pLove))
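For comparison, the same set-of-words pipeline can be reproduced with scikit-learn (a sketch, assuming scikit-learn is installed; note its default tokenizer drops single-character tokens, so the vocabulary may differ slightly from the 84 words above):

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

vectorizer = CountVectorizer(binary=True)            # binary=True mirrors the 0/1 set-of-words features
X = vectorizer.fit_transform(dataset)                # one row per training sentence
clf = MultinomialNB(alpha=1.0)                       # alpha=1.0 is the same add-one (Laplace) smoothing
clf.fit(X, classset)
print(clf.predict(vectorizer.transform(testdata)))   # 1 = love quote, 0 = angry quote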

 

 
