import numpy as np


def load_data_set():
    """
    Create a fake data set for testing.
    :return: list of tokenized documents posting_list, class labels class_vec
    """
    posting_list = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    class_vec = [0, 1, 0, 1, 0, 1]  # 1 = abusive, 0 = normal
    return posting_list, class_vec
def create_vocab_list(data_set):
    """
    Collect the set of all words that appear in the data set.
    :param data_set: list of tokenized documents
    :return: list of all unique words (the vocabulary)
    """
    vocab_set = set()
    for item in data_set:
        vocab_set = vocab_set | set(item)
    return list(vocab_set)
def set_of_words2vec(vocab_list, input_set):
    """
    Mark, for every vocabulary word, whether it appears in the input document.
    :param vocab_list: list of all words (the vocabulary)
    :param input_set: input document, a list of words
    :return: membership vector [0, 1, 0, 1, ...], where 1/0 indicates whether
             the corresponding vocabulary word occurs in the input document
    """
    result = [0] * len(vocab_list)
    for word in input_set:
        if word in vocab_list:
            result[vocab_list.index(word)] = 1
    return result
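# A minimal usage sketch (not in the original file; values are made up) showing
# the pipeline from raw documents to a 0/1 word vector. Note the vector is
# indexed by vocab_list, whose order is arbitrary because it comes from a set:
# >>> docs = [['my', 'dog'], ['stupid', 'dog']]
# >>> vocab = create_vocab_list(docs)        # e.g. ['my', 'dog', 'stupid']
# >>> set_of_words2vec(vocab, ['stupid'])    # [0, 0, 1] for that ordering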
def _train_naive_bayes(train_mat, train_category):
    """
    Original naive Bayes training (no smoothing, no logs).
    :param train_mat: ndarray of document vectors, roughly [[0,1,0,1], [], []]
    :param train_category: class label of each document, e.g. [0, 1, 0];
        its length must equal the number of rows of train_mat
    :return: p0vec, p1vec, pos_abusive
    """
    train_doc_num = len(train_mat)
    words_num = len(train_mat[0])
    # prior probability that a document is abusive (class 1)
    pos_abusive = np.sum(train_category) / train_doc_num
    p0num = np.zeros(words_num)
    p1num = np.zeros(words_num)
    p0num_all = 0
    p1num_all = 0

    for i in range(train_doc_num):
        if train_category[i] == 1:
            p1num += train_mat[i]
            p1num_all += np.sum(train_mat[i])
        else:
            p0num += train_mat[i]
            p0num_all += np.sum(train_mat[i])
    p1vec = p1num / p1num_all
    p0vec = p0num / p0num_all
    return p0vec, p1vec, pos_abusive
def train_naive_bayes(train_mat, train_category):
    """
    Fixed naive Bayes training. Compare it with the original version above;
    the book explains why these changes work (Laplace smoothing and logs).
    :param train_mat: ndarray of document vectors, roughly [[0,1,0,1], [], []]
    :param train_category: class label of each document, e.g. [0, 1, 0];
        its length must equal the number of rows of train_mat
    :return: p0vec, p1vec, pos_abusive
    """
    train_doc_num = len(train_mat)
    words_num = len(train_mat[0])
    pos_abusive = np.sum(train_category) / train_doc_num
    # Initialize counts to 1 and the denominators to 2 (Laplace smoothing)
    # so that no conditional probability is ever exactly zero.
    p0num = np.ones(words_num)
    p1num = np.ones(words_num)
    p0num_all = 2.0
    p1num_all = 2.0

    for i in range(train_doc_num):
        if train_category[i] == 1:
            p1num += train_mat[i]
            p1num_all += np.sum(train_mat[i])
        else:
            p0num += train_mat[i]
            p0num_all += np.sum(train_mat[i])
    # Take logs so the classifier can sum instead of multiply (avoids underflow).
    p1vec = np.log(p1num / p1num_all)
    p0vec = np.log(p0num / p0num_all)
    return p0vec, p1vec, pos_abusive
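# A minimal sketch (not part of the original module; _demo_smoothing and its
# numbers are hypothetical) of why the fixed trainer smooths the counts:
# without smoothing, a word never seen in a class gets probability 0, which
# zeroes out the whole product for any document containing it.
def _demo_smoothing():
    counts = np.array([0, 3, 1])             # occurrences of 3 words in class 1
    total = counts.sum()                      # 4 word tokens in class 1
    unsmoothed = counts / total               # [0.  , 0.75, 0.25] -- the 0 is fatal
    smoothed = (counts + 1) / (total + 2)     # [1/6, 4/6, 2/6] -- no zeros
    print(unsmoothed, smoothed, np.log(smoothed))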
def classify_naive_bayes(vec2classify, p0vec, p1vec, p_class1):
    """
    Classify by turning the product of probabilities into a sum of logs:
    product: P(C|F1F2...Fn) = P(F1F2...Fn|C)P(C) / P(F1F2...Fn)
    sum:     P(F1|C)*P(F2|C)*...*P(Fn|C)*P(C)
             -> log(P(F1|C)) + log(P(F2|C)) + ... + log(P(Fn|C)) + log(P(C))
    :param vec2classify: document vector to classify, e.g. [0,1,1,1,1,...]
    :param p0vec: class 0 (normal documents),
        [log(P(F1|C0)), log(P(F2|C0)), log(P(F3|C0)), ...]
    :param p1vec: class 1 (abusive documents),
        [log(P(F1|C1)), log(P(F2|C1)), log(P(F3|C1)), ...]
    :param p_class1: prior probability of class 1 (abusive documents)
    :return: class 1 or 0
    """
    p1 = np.sum(vec2classify * p1vec) + np.log(p_class1)
    p0 = np.sum(vec2classify * p0vec) + np.log(1 - p_class1)
    if p1 > p0:
        return 1
    else:
        return 0
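# A minimal sketch (not part of the original module; _demo_log_space and its
# values are made up) of why the decision rule works in log space: a long
# product of small probabilities underflows to 0.0 in float64, while the sum
# of their logs stays finite and preserves the comparison.
def _demo_log_space():
    probs = np.full(300, 0.01)       # 300 tiny per-word probabilities
    print(np.prod(probs))            # 0.0 -- underflows
    print(np.sum(np.log(probs)))     # about -1381.6 -- still comparable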
def bag_words2vec(vocab_list, input_set):
    """
    Bag-of-words model: count how many times each vocabulary word occurs,
    instead of only marking whether it occurs at all.
    """
    result = [0] * len(vocab_list)
    for word in input_set:
        if word in vocab_list:
            result[vocab_list.index(word)] += 1
        else:
            print('the word: {} is not in my vocabulary'.format(word))
    return result
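# A minimal contrast (illustrative vocabulary and document, not in the
# original file) between the set and bag models on a repeated word:
# >>> vocab = ['dog', 'my', 'stupid']
# >>> set_of_words2vec(vocab, ['stupid', 'stupid', 'dog'])
# [1, 0, 1]
# >>> bag_words2vec(vocab, ['stupid', 'stupid', 'dog'])
# [1, 0, 2]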
def testing_naive_bayes():
    """
    Test the naive Bayes classifier end to end.
    :return: no return
    """
    # 1. Load the data set and build the vocabulary
    list_post, list_classes = load_data_set()
    vocab_list = create_vocab_list(list_post)

    # 2. Convert every document into a word vector, then train
    train_mat = []
    for post_in in list_post:
        train_mat.append(
            set_of_words2vec(vocab_list, post_in)
        )
    p0v, p1v, p_abusive = train_naive_bayes(np.array(train_mat),
                                            np.array(list_classes))
    # 3. Classify two test documents
    test_one = ['love', 'my', 'dalmation']
    test_one_doc = np.array(set_of_words2vec(vocab_list, test_one))
    print('the result is: {}'.format(classify_naive_bayes(test_one_doc, p0v, p1v, p_abusive)))
    test_two = ['stupid', 'garbage']
    test_two_doc = np.array(set_of_words2vec(vocab_list, test_two))
    print('the result is: {}'.format(classify_naive_bayes(test_two_doc, p0v, p1v, p_abusive)))
if __name__ == "__main__":
    testing_naive_bayes()