
ML - Naïve Bayes

 

The Naïve Bayes model assumes that all features (random variables) are conditionally independent given the class, so the joint probability distribution can be computed by simply multiplying the individual conditional probabilities.
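In symbols: for a class C and features x_1, ..., x_n, conditional independence gives

P(C \mid x_1, \dots, x_n) \propto P(C) \prod_{i=1}^{n} P(x_i \mid C)

and the classifier predicts the class C that maximizes this product.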

 

 

==============

References

https://zh.wikipedia.org/wiki/%E6%9C%B4%E7%B4%A0%E8%B4%9D%E5%8F%B6%E6%96%AF%E5%88%86%E7%B1%BB%E5%99%A8

https://www.youtube.com/watch?v=FsvNJUIMSh8

https://blog.csdn.net/u010918541/article/details/70880607

https://www.cnblogs.com/yuxc/archive/2011/07/10/2102559.html

https://blog.csdn.net/amds123/article/details/70173402

=============

 

# https://blog.csdn.net/u010918541/article/details/70880607

import numpy as np

# Build a vocabulary list containing every distinct word in the dataset
def createVocabList(dataSet):
    vocabSet = set([])                       # create empty set
    for document in dataSet:
        vocabSet = vocabSet | set(document)  # a = t | s is the union of sets t and s
    return list(vocabSet)
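For example, on a tiny input the function collects the union of all words (set ordering is arbitrary, so the order of the returned list may vary between runs):

createVocabList([['a', 'b'], ['b', 'c']])   # -> ['a', 'b', 'c'] in some order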

# Mark which vocabulary words appear in a given sentence (set-of-words model)
def setOfWords2Vec(vocabList, inputSet):     # inputSet => the words of one sentence (8 sentences total)
    returnVec = [0] * len(vocabList)         # 84 zeros, one slot per vocabulary word
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
    return returnVec
'''
Example inputSet:
'My uncle Sammy was an angry man He had printed on his tombstone What are you looking at'

returnVec: among the 84 vocabulary words, entries are 1 for words that appear in this sentence and 0 otherwise:
[0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]
'''

dataset=['There is only one love happiness in this life to love and love be loved',
    'Being deeply loved by love someone gives you strength while loving someone deeply gives you courage',
    'Success is no accident It is hard work perseverance learning studying sacrifice and most of all love of what you are doing or learning to do',
    'Choose a job you love and you will never have to work a day in your life',
    'For every minute you remain angry you give up sixty seconds of peace of mind',
    'When angry count to ten before you speak If very angry count to one hundred',
    'When angry count to four when very angry swear',
    'My uncle Sammy was an angry man He had printed on his tombstone What are you looking at',
    ]
classset=[1,1,1,1,0,0,0,0] # dataset => the first 4 sentences are LOVE (label 1), the last 4 are ANGRY (label 0)
testdata=['I m still pretty self-centered greedy and angry',
   'Love yourself It is important to stay positive because beauty comes from the inside out']

listofTocken = []
testofTocken = []
for i in range(len(dataset)):
    listofTocken.append(dataset[i].lower().split())   # training set: split each sentence into words
for i in range(len(testdata)):
    testofTocken.append(testdata[i].lower().split())  # test set: split each sentence into words
 
# Build the vocabulary list (no duplicate words)
mVocablist = createVocabList(listofTocken)   # 84 distinct words


''' mVocablist
['while', 'printed', 'your', 'only', 'in', 'an', 'is', 'perseverance', 'deeply', 'man',
 'you', 'choose', 'remain', 'happiness', 'was', 'no', 'every', 'there', 'at', 'doing',
 'give', 'four', 'hard', 'love', 'loving', 'tombstone', 'courage', 'someone', 'it',
 'very', 'or', 'never', 'a', 'one', 'studying', 'when', 'he', 'his', 'and', 'being',
 'work', 'life', 'sixty', 'job', 'up', 'looking', 'most', 'before', 'seconds',
 'strength', 'if', 'my', 'this', 'minute', 'are', 'have', 'to', 'all', 'uncle',
 'hundred', 'swear', 'of', 'had', 'speak', 'loved', 'mind', 'day', 'ten', 'angry',
 'sammy', 'be', 'what', 'do', 'accident', 'peace', 'count', 'on', 'sacrifice',
 'for', 'learning', 'by', 'gives', 'will', 'success']
'''


# For each sentence, build a 0/1 vector over the vocabulary (1 = word appears, 0 = it does not)
trainMat = []

for words in listofTocken:
    # Example words:
    # ['my', 'uncle', 'sammy', 'was', 'an', 'angry', 'man', 'he', 'had', 'printed',
    #  'on', 'his', 'tombstone', 'what', 'are', 'you', 'looking', 'at']
    trainMat.append(setOfWords2Vec(mVocablist, words))

# There are 8 sentences in total, drawn from 84 distinct words.
# trainMat has 8 rows, one per sentence; each row has 84 entries,
# and an entry of 1 means that word appears in that sentence.
 
# Build the test matrix testMat the same way
testMat = []
for words in testofTocken:
    testMat.append(setOfWords2Vec(mVocablist, words))  # 1 if the vocabulary word appears, else 0
 
# Number of training documents
numOftrainDocs = len(trainMat)                 # 8 sentences
# Length of each row, i.e. the vocabulary size
numwords = len(trainMat[0])                    # 84

# Prior probability of the love class
pLove = sum(classset) / float(numOftrainDocs)  # 4/8
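With 4 love quotes and 4 angry quotes, pLove = 4/8 = 0.5, so in testNB below the prior terms log(pL) and log(1.0 - pL) are equal and the classification is decided entirely by the per-word likelihood sums.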

# Initialize the word-count vectors for the love and angry classes.
# Starting the counts at 1 (and the denominators at 2.0) is the add-one
# (Laplace) smoothing used here, so no word ever gets a zero probability.
ploveNum = np.ones(numwords)    # 84 ones
pangryNum = np.ones(numwords)   # 84 ones

ploveDenom = 2.0
pangryDenom = 2.0

for i in range(numOftrainDocs):  # 8 sentences
    if classset[i] == 1:
        # This document belongs to the love class (the first 4 sentences)
        ploveNum = ploveNum + trainMat[i]
        # ploveNum is a vector [1,1,...,1]; trainMat[i] is also a vector [0,0,1,...,0], so the result is still a vector
        ploveDenom = ploveDenom + sum(trainMat[i])  # sum of the 1s in trainMat[i]; a float (65 in total)
    else:
        # This document belongs to the angry class (the last 4 sentences)
        pangryNum += trainMat[i]
        # likewise still a vector
        pangryDenom += sum(trainMat[i])  # sum of the 1s in trainMat[i]; a float (52 in total)
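After this loop, each per-class word probability is the smoothed ratio computed below. In formula form (this code's add-one variant; textbook Laplace smoothing would add the vocabulary size, 84 here, to the denominator rather than 2):

P(w_j \mid \mathrm{love}) = \frac{n_{j,\mathrm{love}} + 1}{\sum_k n_{k,\mathrm{love}} + 2}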

ploveVect = np.log(ploveNum / ploveDenom)     # per-word log probabilities for the love class
pangryVect = np.log(pangryNum / pangryDenom)  # per-word log probabilities for the angry class
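Taking logs turns the product of per-word probabilities into a sum and avoids numerical underflow when many small probabilities are multiplied; this is why testNB below scores each class as

\log P(C \mid d) \propto \log P(C) + \sum_j x_j \log P(w_j \mid C)

where x_j is the 0/1 entry of the document vector.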

# print(ploveVect, pangryVect)
# print('overall prior probability of love quotes:', pLove)

def testNB(doc, ploveV, pangryV, pL):   # doc = one row of testMat (2 test sentences); pL = prior probability of love quotes
    plove = sum(doc * ploveV) + np.log(pL)
    pangry = sum(doc * pangryV) + np.log(1.0 - pL)
    if plove > pangry:
        return "love quote"
    else:
        return "angry quote"
   
   
for i in range(len(testMat)):
    print(testdata[i], 'is @@@', testNB(testMat[i], ploveVect, pangryVect, pLove))
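For comparison, the same set-of-words pipeline can be reproduced with scikit-learn (a sketch, assuming scikit-learn is installed; note its default tokenizer drops single-character tokens, so the vocabulary may differ slightly from the 84 words above):

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

vectorizer = CountVectorizer(binary=True)            # binary=True mirrors the 0/1 set-of-words features
X = vectorizer.fit_transform(dataset)                # one row per training sentence
clf = MultinomialNB(alpha=1.0)                       # alpha=1.0 is the same add-one (Laplace) smoothing
clf.fit(X, classset)
print(clf.predict(vectorizer.transform(testdata)))   # 1 = love quote, 0 = angry quote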

 

 
