前言
最近一直在看机器学习相关的书,看了太多理论,却没有实战,所以打算跟着《机器学习实战》实现一些算法,除了代码还加入了注释以便于理解。
使用numpy简单实现
import numpy as np
import operator
def createDataSet():
    """Return the toy training set: four 2-D points and their class labels."""
    points = np.array([[1.0, 1.1],
                       [1.0, 1.0],
                       [0, 0],
                       [0, 0.1]])
    point_labels = ['A', 'A', 'B', 'B']
    return points, point_labels
def classify0(inX, dataSet, labels, k):
    """Classify inX by a majority vote among its k nearest neighbours.

    Args:
        inX: input sample; a sequence matching dataSet's column count.
        dataSet: (m, d) numpy array of training samples.
        labels: length-m sequence of training labels.
        k: number of nearest neighbours that vote.

    Returns:
        The label winning the vote among the k closest training samples.
    """
    # Difference of inX to every training row; numpy broadcasting gives the
    # same values as tiling inX m times, without building the tiled array.
    deltas = np.asarray(inX) - dataSet
    # Euclidean distances: row-wise sum of squared differences, then sqrt.
    dists = (deltas ** 2).sum(axis=1) ** 0.5
    # Row indices ordered from nearest to farthest.
    nearest = dists.argsort()
    # Tally the labels of the k nearest neighbours.
    votes = {}
    for idx in nearest[:k]:
        neighbour_label = labels[idx]
        votes[neighbour_label] = votes.get(neighbour_label, 0) + 1
    # max() returns the first maximal entry in insertion order, which is the
    # same tie-break as a stable descending sort on the vote counts.
    return max(votes.items(), key=operator.itemgetter(1))[0]
if __name__ == "__main__":
    # Smoke test: classify a point near cluster 'A' on the toy data set.
    sample_group, sample_labels = createDataSet()
    predicted = classify0([1, 1], sample_group, sample_labels, 3)
    print("输入样本的类别为:", predicted)
使用k近邻算法改进约会网站的配对效果
# 使用k近邻法改进约会网站的配对效果
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import kNN as myKnn
# 读取文本数据
def file2matrix(filename):
    """Parse a tab-separated dating-data file into features and labels.

    Each line is expected to hold three numeric feature columns followed by
    an integer class label, separated by tabs.

    Args:
        filename: path to the tab-separated data file.

    Returns:
        (returnMat, classLabelVector): an (n, 3) float array of features and
        a list of n integer labels.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a line's last field is not an integer.
    """
    # 'with' guarantees the file handle is closed even if parsing below
    # raises; the original closed it manually and leaked on error.
    with open(filename) as fr:
        lines = fr.readlines()
    returnMat = np.zeros((len(lines), 3))
    classLabelVector = []
    for index, line in enumerate(lines):
        # strip() removes the trailing newline (and surrounding whitespace).
        fields = line.strip().split('\t')
        # First three fields are the features; numpy coerces the strings.
        returnMat[index, :] = fields[0:3]
        # Last field is the integer class label.
        classLabelVector.append(int(fields[-1]))
    return returnMat, classLabelVector
# 归一化
def autoNorm(dataSet):
    """Min-max normalise every column of dataSet into the [0, 1] range.

    Args:
        dataSet: (m, d) numpy array of raw feature values.

    Returns:
        (normDataSet, ranges, minVals): the normalised (m, d) array, the
        per-column value ranges, and the per-column minimums (both length d).
    """
    # Per-column extrema (axis 0 collapses the rows).
    col_min = dataSet.min(0)
    col_max = dataSet.max(0)
    ranges = col_max - col_min
    # Broadcasting subtracts/divides the length-d vectors row by row —
    # identical values to tiling them m times, without the copies.
    norm = (dataSet - col_min) / ranges
    return norm, ranges, col_min
# 可视化数据
def plotData():
    """Scatter-plot columns 1 and 2 of the module-global dating data.

    Reads the module-level globals datingDataMat and datingLabels; marker
    size and colour are both scaled from the class labels so the classes
    are visually distinguishable. Blocks until the window is closed.
    """
    figure = plt.figure()
    axes = figure.add_subplot(111)
    # Same label-derived values drive both the marker size and the colour.
    scaled_labels = 15.0 * np.array(datingLabels)
    axes.scatter(datingDataMat[:, 1], datingDataMat[:, 2],
                 scaled_labels, scaled_labels)
    plt.show()
# 测试代码正确率
def datingClassTest():
    """Estimate the kNN error rate with a 10% hold-out evaluation.

    Loads 'datingTestSet2.txt', normalises it, classifies the first 10% of
    rows against the remaining 90% with k=3, and prints each prediction
    plus the overall error rate.

    Returns:
        None; results are printed to stdout.
    """
    hoRatio = 0.10  # fraction of rows held out for testing
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    errorCount = 0.0
    for i in range(numTestVecs):
        # Train on the last (m - numTestVecs) rows, test on row i.
        classifierResult = myKnn.classify0(normMat[i, :], normMat[numTestVecs:m, :], datingLabels[numTestVecs:m], 3)
        # Fixed typo in the printed message: "anser" -> "answer".
        print("the classifier came back with:%d, the real answer is:%d" % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print("the total error rate is:%f" % (errorCount / float(numTestVecs)))
# 正式程序
def classifyPerson():
    """Interactively classify how much the user would like a person.

    Prompts for three features on stdin, normalises them with the training
    data's statistics, runs kNN (k=3), and prints the predicted category.
    """
    result_names = ['not at all', 'in small doses', 'in large doses']
    # Collect the three features from the user (same prompt order as before).
    percent_games = float(input("percentage of time spent playing video games?"))
    flier_miles = float(input("frequent flier miles earned per year?"))
    ice_cream = float(input("liters of ice cream consumed per year?"))
    # Load the training data and its normalisation parameters.
    training_mat, training_labels = file2matrix('datingTestSet2.txt')
    norm_mat, value_ranges, min_vals = autoNorm(training_mat)
    # Normalise the query with the SAME min/range as the training data.
    query = np.array([flier_miles, percent_games, ice_cream])
    category = myKnn.classify0((query - min_vals) / value_ranges, norm_mat, training_labels, 3)
    # Labels are 1-based, hence the -1 when indexing the name list.
    print("You will probably like this person: %s" % result_names[category - 1])
if __name__ == "__main__":
    # plotData() reads these module-level globals, so the names must stay.
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    plotData()
    # Normalisation demo (values unused afterwards), then interactive run.
    normMat, ranges, minVals = autoNorm(datingDataMat)
    classifyPerson()