Naive Bayes 在机型转换分类中的实现

本文最后更新于:2022年4月24日 下午

Naive Bayes 在机型转换分类中的实现

1
2
3
4
import re
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
1
2
3
4
5
6
def splitWord(orgin):
    """Tokenize one raw device-name string into lowercase words.

    The string is cut at whitespace runs, underscores and every other
    non-word character (colon, slash, brackets, ...). Fragments that are
    empty or a single character long are dropped; the rest are lowercased.

    Args:
        orgin: A single string such as 'Xiaomi:Coolpad8297'.

    Returns:
        A list of lowercase tokens, e.g. ['xiaomi', 'coolpad8297'].
    """
    tokens = []
    for piece in re.split(r'\s+|_|\W', orgin):
        # Adjacent separators leave empty strings behind, and one-character
        # fragments are treated as noise; skip both.
        if len(piece) <= 1:
            continue
        tokens.append(piece.lower())
    return tokens
1
2
3
4
5
def dataToList(data):
    """Convert a DataFrame (or any array-like) into nested Python lists.

    The input is materialised as a NumPy array first and then turned into
    plain lists, so each row of a 2-D input becomes one inner list.
    """
    return np.array(data).tolist()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
def dataStat(dataList):
    """Collect token and label statistics from the training rows.

    Each row is indexed as ``row[0]`` (raw text, tokenized with
    ``splitWord``) and ``row[1]`` (the label / y value).

    Args:
        dataList: Sequence of rows as described above.

    Returns:
        A 6-tuple ``(wordList, wordNoRepet, wordFre, yList, yNoRepet, yFre)``:
        wordList    -- every token from every row, duplicates kept
        wordNoRepet -- the distinct tokens, in first-seen order
        wordFre     -- wordFre[i] is the share of wordList taken by
                       wordNoRepet[i]
        yList       -- every label, duplicates kept
        yNoRepet    -- the distinct labels, in first-seen order
        yFre        -- yFre[i] is the share of yList taken by yNoRepet[i]
    """
    from collections import Counter

    wordList = []
    yList = []
    for row in dataList:
        wordList += splitWord(row[0])
        yList.append(row[1])

    # One counting pass each instead of calling list.count() once per
    # distinct item, which was quadratic in the size of the data.
    wordCounts = Counter(wordList)
    yCounts = Counter(yList)

    wordNum = len(wordList)
    yNum = len(yList)

    # Counter keeps insertion order, so the unique lists are deterministic
    # and each frequency list stays aligned with its unique list.
    wordNoRepet = list(wordCounts)
    wordFre = [wordCounts[word] / wordNum for word in wordNoRepet]
    yNoRepet = list(yCounts)
    yFre = [yCounts[y] / yNum for y in yNoRepet]
    return wordList, wordNoRepet, wordFre, yList, yNoRepet, yFre
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
def y_splitX(dataList):
    """Group the raw x values, and their tokens, by label.

    Each row is indexed as ``row[0]`` (raw text) and ``row[1]`` (label).

    Args:
        dataList: Sequence of rows as described above.

    Returns:
        A 3-tuple ``(ytest, xtest, xtestword)`` of equal-length lists:
        ytest     -- the distinct labels in first-seen order, e.g.
                     ['OPPOFind7(X9077/标准版/移动4G)', '飞利浦I999(双4G)']
        xtest     -- xtest[i] holds every raw x string labelled ytest[i]
        xtestword -- xtestword[i] is the concatenation of
                     ``splitWord(x)`` over xtest[i], duplicates kept, e.g.
                     ['oppo', 'find7', 'oppo', 'find', 'oppo', 'x9007', ...]
    """
    # Maps a label to its position in ytest/xtest so each row is placed
    # with one dict lookup instead of a linear scan of ytest.
    position = {}
    ytest = []
    xtest = []
    for xy in dataList:
        x, y = xy[0], xy[1]
        idx = position.get(y)
        if idx is None:
            position[y] = len(ytest)
            ytest.append(y)
            xtest.append([x])
        else:
            xtest[idx].append(x)

    xtestword = []
    for names in xtest:
        words = []
        for name in names:
            words += splitWord(name)
        xtestword.append(words)
    return ytest, xtest, xtestword
1
2
3
4
5
6
7
8
def listSplit(xList):
    """Separate rows into tokenized x values and untouched y values.

    Args:
        xList: Sequence of rows where ``row[0]`` is the raw text and
            ``row[1]`` is the label.

    Returns:
        ``(xListSplit, listY)``: the ``splitWord`` tokens of every row's
        text, and the labels as-is, both in input order.
    """
    # Walk the input once, then unzip the pairs into two parallel lists.
    pairs = [(splitWord(row[0]), row[1]) for row in xList]
    xListSplit = [tokens for tokens, _ in pairs]
    listY = [label for _, label in pairs]
    return xListSplit, listY

修改了概率计算方法:
现有机型['huawei g6:gg'],分词后得到['huawei','g6','gg']。测试数据处理后'华为Ascend G6(移动版)'对应的所有分词为['huawei','g6','4g'],并没有出现分词'gg',则

$P(华为Ascend G6(移动版) | [‘huawei’,’g6’,’gg’]) = P(华为Ascend G6(移动版) | ‘huawei’)+P(华为Ascend G6(移动版) | ‘g6’)+P(华为Ascend G6(移动版) | ‘gg’)$

$P(华为Ascend G6(移动版) | ‘huawei’) =\frac{P(‘huawei’|华为Ascend G6(移动版))*P(华为Ascend G6(移动版))}{P(‘huawei’)}$

$P(华为Ascend G6(移动版) | ‘g6’) =\frac{P(‘g6’|华为Ascend G6(移动版))*P(华为Ascend G6(移动版))}{P(‘g6’)}$

由于分词 ‘gg’ 没有出现在该机型对应的分词中,令 $P(华为Ascend G6(移动版) | ‘gg’) = 0$

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
def predict(xlabel, yTrain, xTrainSplit, yNoRepet, wordNoRepet, wordFre, yFre=None):
    """Score every training class for one tokenized query and pick the best.

    For each class the score is the sum, over the query tokens that occur
    in that class's training tokens, of

        (count(word in class) / len(class tokens)) * P(class) / P(word)

    Tokens that do not occur in the class contribute 0. The score is a sum
    of per-word terms, so it is not normalised and may exceed 1.

    Args:
        xlabel: Tokenized query, e.g. ['huawei', 'g6'].
        yTrain: Distinct class labels, aligned with xTrainSplit.
        xTrainSplit: xTrainSplit[i] is the token list of class yTrain[i].
        yNoRepet: Distinct labels, aligned with yFre.
        wordNoRepet: Distinct tokens, aligned with wordFre.
        wordFre: Relative frequency of each token in wordNoRepet.
        yFre: Relative frequency of each label in yNoRepet. When omitted,
            the module-level ``yFre`` is used, which is what this function
            always read before the parameter existed.

    Returns:
        ``(proba, predictY, probability)``: the best score, the label that
        achieved it (the first one on ties), and the score of every class
        in yTrain order.

    Raises:
        NameError: yFre was not passed and no module-level ``yFre`` exists.
        ValueError: there is no training class to score, or a class label
            is missing from yNoRepet.
    """
    if yFre is None:
        moduleNames = globals()
        if 'yFre' not in moduleNames:
            raise NameError("name 'yFre' is not defined")
        yFre = moduleNames['yFre']

    # Position lookup tables replace repeated list.index() scans.
    # setdefault keeps the first position, matching list.index().
    yPos = {}
    for i, y in enumerate(yNoRepet):
        yPos.setdefault(y, i)
    wordPos = {}
    for i, w in enumerate(wordNoRepet):
        wordPos.setdefault(w, i)

    probability = []
    for classy0, classy1 in zip(yTrain, xTrainSplit):
        classLen = len(classy1)
        prob = 0
        for word in xlabel:
            occurrences = classy1.count(word)
            if not occurrences or word not in wordPos:
                # Token unseen for this class: its term is defined as 0.
                continue
            if classy0 not in yPos:
                raise ValueError('%r is not in yNoRepet' % (classy0,))
            prob += (
                ((occurrences / classLen) * yFre[yPos[classy0]])
                / wordFre[wordPos[word]]
            )
        probability.append(prob)

    if not probability:
        raise ValueError('predict() needs at least one training class')
    proba = max(probability)
    predictY = yTrain[probability.index(proba)]
    return proba, predictY, probability
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
def classfy(xTestSplit, yTest, yTrain, xTrainSplit, yNoRepet, wordNoRepet=None, wordFre=None):
    """Predict a label for every test row and measure the accuracy.

    Args:
        xTestSplit: Tokenized test queries, one token list per row.
        yTest: True labels, aligned with xTestSplit.
        yTrain: Distinct training labels, aligned with xTrainSplit.
        xTrainSplit: Token list of every training class.
        yNoRepet: Distinct labels, passed through to ``predict``.
        wordNoRepet: Distinct training tokens. When omitted, the
            module-level ``wordNoRepet`` is used, which is what this
            function always read before the parameter existed.
        wordFre: Token frequencies aligned with wordNoRepet. When omitted,
            the module-level ``wordFre`` is used.

    Returns:
        ``(allpredictProb, allpredictY, predictAccuracy, count)``: the best
        score and predicted label per row, the fraction of rows predicted
        correctly (0.0 for an empty test set), and the number of correct
        predictions.

    Raises:
        NameError: a table was not passed and no module-level variable of
            the same name exists.
    """
    moduleNames = globals()
    if wordNoRepet is None:
        if 'wordNoRepet' not in moduleNames:
            raise NameError("name 'wordNoRepet' is not defined")
        wordNoRepet = moduleNames['wordNoRepet']
    if wordFre is None:
        if 'wordFre' not in moduleNames:
            raise NameError("name 'wordFre' is not defined")
        wordFre = moduleNames['wordFre']

    allpredictProb = []
    allpredictY = []
    count = 0
    for row0, row1 in zip(xTestSplit, yTest):
        proba, predictY, _ = predict(row0, yTrain, xTrainSplit, yNoRepet, wordNoRepet, wordFre)
        allpredictProb.append(proba)
        allpredictY.append(predictY)
        if row1 == predictY:
            count += 1

    # Guard the division so an empty test set yields 0.0 instead of raising
    # ZeroDivisionError.
    total = len(xTestSplit)
    predictAccuracy = count / total if total else 0.0
    return allpredictProb, allpredictY, predictAccuracy, count
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
# Load the data; read_csv returns a DataFrame. Only CSV columns 1 and 2 are
# read; downstream code treats the first as the raw text (x) and the second
# as the label (y).
orgindata=pd.read_csv('C:/Users/Administrator/Desktop/data.csv',encoding='utf-8',usecols=[1,2])
# Split into a training set and a test set (5% of the rows held out).
# NOTE(review): no random_state is passed, so the split and the accuracy
# printed below presumably vary between runs - confirm whether that is intended.
train,test=train_test_split(orgindata,test_size=0.05)
# Convert the training DataFrame to a list of rows.
trainList=dataToList(train)
# Compute token and label frequencies on the training rows.
# NOTE: classfy() and predict() rely on the module-level names wordNoRepet,
# wordFre and yFre assigned here, so these names must not be renamed or
# moved into a function.
wordList,wordNoRepet,wordFre,yList,yNoRepet,yFre=dataStat(trainList)
# Group the training rows by label.
yTrain,xTrain,xTrainSplit=y_splitX(trainList)
# Convert the test DataFrame to a list of rows.
testList=dataToList(test)
# Tokenize the x values of the test rows.
xTestSplit,yTest=listSplit(testList)
# Predict a label for every test row.
allpredictProb,allpredictY,predictAccuracy,count=classfy(xTestSplit,yTest,yTrain,xTrainSplit,yNoRepet)
# Print the accuracy as a percentage.
print('PredictAccuracy is %.2f%%'%(predictAccuracy*100))
PredictAccuracy is 74.52%
1
2
3
# Manual check: classify one hand-tokenized query and print the winning
# label with its score (scaled by 100 for display).
proba,predictY,probability= predict(['g6','huawei'],yTrain,xTrainSplit,yNoRepet,wordNoRepet,wordFre)
print('预测机型为:%s,概率为:%.2f%%'%(predictY,proba*100))
预测机型为:华为Ascend G6(移动版),概率为:24.46%