import re
from collections import Counter

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
1 2 3 4 5 6
# Separators: whitespace runs, underscores and any other non-word character
# (e.g. the colon in 'Xiaomi:Coolpad8297'). Compiled once at import time.
_TOKEN_SEPARATOR = re.compile(r'\s+|_|\W')


def splitWord(orgin):
    """Tokenise raw text into lowercase words.

    The text is split on whitespace, underscores and any other non-word
    character. Tokens of length 0 or 1 are discarded and the remaining
    tokens are lowercased.

    Args:
        orgin: A string such as ``'Xiaomi:Coolpad8297'``, or an iterable of
            such strings such as ``['Xiaomi:Coolpad8297']``. For an
            iterable, the tokens of every element are concatenated in order.

    Returns:
        A list of lowercase tokens, e.g. ``['xiaomi', 'coolpad8297']``.
    """
    if isinstance(orgin, str):
        pieces = _TOKEN_SEPARATOR.split(orgin)
    else:
        # The original comment documents list input, which re.split cannot
        # take directly; tokenise each element and flatten the result.
        pieces = [
            piece
            for text in orgin
            for piece in _TOKEN_SEPARATOR.split(text)
        ]
    # Adjacent separators yield empty strings; those and single-character
    # fragments are dropped by the length filter.
    return [tok.lower() for tok in pieces if len(tok) > 1]
def dataStat(dataList):
    """Collect token and label frequency statistics from a data set.

    Args:
        dataList: Iterable of rows where ``row[0]`` is the raw text handed
            to ``splitWord`` and ``row[1]`` is the (hashable) y value.

    Returns:
        A 6-tuple ``(wordList, wordNoRepet, wordFre, yList, yNoRepet, yFre)``:

        * ``wordList``    - every token from every row, duplicates kept.
        * ``wordNoRepet`` - distinct tokens, in first-seen order.
        * ``wordFre``     - relative frequency of each token, aligned with
          ``wordNoRepet`` (count / total number of tokens).
        * ``yList``       - every y value, duplicates kept.
        * ``yNoRepet``    - distinct y values, in first-seen order.
        * ``yFre``        - relative frequency of each y value, aligned with
          ``yNoRepet`` (count / total number of rows).

        All six are empty lists when ``dataList`` is empty.
    """
    wordList = []  # all tokens, with repeats
    yList = []  # all y values, with repeats
    for row in dataList:
        wordList.extend(splitWord(row[0]))
        yList.append(row[1])

    # Counter tallies in one pass (instead of list.count per distinct item)
    # and keeps first-seen order, so the output is reproducible across runs.
    word_counts = Counter(wordList)
    y_counts = Counter(yList)

    wordNoRepet = list(word_counts)
    yNoRepet = list(y_counts)

    wordNum = len(wordList)  # total number of tokens, with repeats
    yNum = len(yList)  # total number of y values, with repeats

    # The comprehensions are empty when there is no data, so no division
    # by zero can occur.
    wordFre = [word_counts[word] / wordNum for word in wordNoRepet]
    yFre = [y_counts[y] / yNum for y in yNoRepet]

    return wordList, wordNoRepet, wordFre, yList, yNoRepet, yFre
def predict(xlabel, yTrain, xTrainSplit, yNoRepet, wordNoRepet, wordFre,
            yFre=None):
    """Score one tokenised query against every training sample.

    For each training sample the score is the sum, over every query token
    that occurs both in the sample and in ``wordNoRepet``, of::

        (count of token in sample / sample length) * freq(sample's y)
        -------------------------------------------------------------
                          overall freq(token)

    The prediction is the y value of the first sample with the highest
    score.

    Args:
        xlabel: Tokenised query, e.g. ``['huawei', 'g6']``.
        yTrain: Iterable of training y values, one per training sample.
        xTrainSplit: Iterable of tokenised training samples (lists of
            tokens), aligned with ``yTrain``.
        yNoRepet: Distinct y values.
        wordNoRepet: Distinct tokens.
        wordFre: Frequency of each token, aligned with ``wordNoRepet``.
        yFre: Frequency of each y value, aligned with ``yNoRepet``. When
            omitted, a module-level ``yFre`` is used, which is what the
            function relied on implicitly before this parameter existed.

    Returns:
        ``(proba, predictY, probability)``: the best score, the y value of
        the best-scoring sample, and the list of scores for every sample.

    Raises:
        NameError: ``yFre`` was not passed and no module-level ``yFre``
            exists.
        ValueError: there are no training samples, or a matching sample's
            y value is missing from ``yNoRepet``.
    """
    if yFre is None:
        try:
            yFre = globals()["yFre"]
        except KeyError:
            raise NameError(
                "name 'yFre' is not defined; pass it as the yFre argument"
            ) from None

    # Lookup tables replace repeated list.index() scans. setdefault keeps
    # the first occurrence, matching list.index() if a key is duplicated.
    word_freq = {}
    for word, freq in zip(wordNoRepet, wordFre):
        word_freq.setdefault(word, freq)
    y_freq = {}
    for y_value, freq in zip(yNoRepet, yFre):
        y_freq.setdefault(y_value, freq)

    labels = []
    probability = []
    for label, tokens in zip(yTrain, xTrainSplit):
        token_counts = Counter(tokens)
        sample_size = len(tokens)
        prob = 0
        for word in xlabel:
            occurrences = token_counts.get(word, 0)
            if occurrences and word in word_freq:
                if label not in y_freq:
                    raise ValueError(f"{label!r} is not in yNoRepet")
                prob += (
                    ((occurrences / sample_size) * y_freq[label])
                    / word_freq[word]
                )
        labels.append(label)
        probability.append(prob)

    if not probability:
        raise ValueError("predict() requires at least one training sample")

    proba = max(probability)
    # Take the label from the values actually iterated, not yTrain[pos]:
    # positional indexing is wrong for containers indexed by label
    # (presumably yTrain can be a pandas Series - confirm with callers).
    predictY = labels[probability.index(proba)]
    return proba, predictY, probability