前言

最近一直在看机器学习相关的书,看了太多理论,却没有实战,所以打算跟着《机器学习实战》实现一些算法,除了代码还加入了注释以便于理解。

使用numpy简单实现k近邻(kNN)算法

import numpy as np
import operator


def createDataSet():
    """Build the toy training set used to demonstrate the kNN classifier.

    Returns:
        tuple: ``(group, labels)`` where ``group`` is a 4x2 NumPy array of
        sample coordinates and ``labels`` is a list holding the class name
        of each row of ``group``.
    """
    # Keep every point next to its class so the pairing is obvious.
    samples = [
        ([1.0, 1.1], 'A'),
        ([1.0, 1.0], 'A'),
        ([0, 0], 'B'),
        ([0, 0.1], 'B'),
    ]
    group = np.array([point for point, _ in samples])
    labels = [name for _, name in samples]
    return group, labels


def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest samples.

    Distances are Euclidean and are measured between ``inX`` and every row
    of ``dataSet``.

    Args:
        inX: Feature vector to classify (sequence or 1-D array) with as many
            entries as ``dataSet`` has columns.
        dataSet: 2-D array-like of training samples, one sample per row.
        labels: Sequence with the class label of each row of ``dataSet``.
        k: Number of nearest neighbours that take part in the vote. A value
            larger than the number of training samples is clamped to that
            number instead of failing with an ``IndexError``.

    Returns:
        The label that collected the most votes. When several labels tie,
        the one whose first vote came from the closest neighbour wins.

    Raises:
        ValueError: If ``k`` is smaller than 1 or ``dataSet`` has no rows.
    """
    dataSet = np.asarray(dataSet)
    dataSetSize = dataSet.shape[0]
    if k < 1:
        raise ValueError("k must be at least 1")
    if dataSetSize == 0:
        raise ValueError("dataSet must contain at least one sample")

    # Broadcasting subtracts every training row from inX, so the input does
    # not have to be tiled into a full matrix first.
    diffMat = np.asarray(inX) - dataSet
    # axis=1 sums the squared differences across the columns of each row.
    distances = (diffMat ** 2).sum(axis=1) ** 0.5
    # argsort returns the row indices ordered from nearest to farthest.
    nearestIndices = distances.argsort()[:min(k, dataSetSize)]

    classCount = {}
    for sampleIndex in nearestIndices:
        voteLabel = labels[sampleIndex]
        classCount[voteLabel] = classCount.get(voteLabel, 0) + 1
    # max() keeps the first maximum in insertion order, i.e. the tied label
    # that was voted for by the nearest neighbour.
    return max(classCount, key=classCount.get)


if __name__ == "__main__":
    # Demo: classify the point (1, 1) against the toy data set using its
    # three nearest neighbours and print the predicted class.
    samples, sampleLabels = createDataSet()
    print("输入样本的类别为:", classify0([1, 1], samples, sampleLabels, 3))

使用k近邻算法改进约会网站的配对效果

# 使用k近邻法改进约会网站的配对效果
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import kNN as myKnn


def file2matrix(filename):
    """Load a tab-separated data file into a feature matrix and a label list.

    Every non-blank line must hold three numeric feature columns followed by
    an integer class label in its last column. Blank lines (for example a
    trailing newline at the end of the file) are ignored.

    Args:
        filename: Path of the text file to read.

    Returns:
        tuple: ``(returnMat, classLabelVector)`` where ``returnMat`` is an
        ``(n, 3)`` float array with the first three columns of each record
        and ``classLabelVector`` is the list of the ``n`` integer labels.

    Raises:
        ValueError: If a feature or label cannot be parsed as a number, or a
            record has fewer than three feature columns.
    """
    # The context manager closes the file even when reading fails.
    with open(filename) as fr:
        # strip() removes surrounding whitespace, including the newline.
        records = [line.strip().split('\t') for line in fr if line.strip()]

    returnMat = np.zeros((len(records), 3))
    classLabelVector = []
    for index, fields in enumerate(records):
        returnMat[index, :] = [float(value) for value in fields[0:3]]
        # The class label is stored in the last column.
        classLabelVector.append(int(fields[-1]))
    return returnMat, classLabelVector


def autoNorm(dataSet):
    """Rescale every column of ``dataSet`` to the range [0, 1].

    Each value becomes ``(value - column_min) / (column_max - column_min)``.
    A column whose values are all equal has a range of zero; such a column
    is mapped to 0.0 instead of producing NaN from a 0/0 division.

    Args:
        dataSet: 2-D numeric array-like, one sample per row.

    Returns:
        tuple: ``(normDataSet, ranges, minVals)`` where ``normDataSet`` has
        the shape of ``dataSet``, ``ranges`` holds ``max - min`` of every
        column (zero for constant columns) and ``minVals`` holds the minimum
        of every column.
    """
    dataSet = np.asarray(dataSet)
    # Reducing along axis 0 yields one minimum / maximum per column.
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # Divide constant columns by 1 so they normalise to 0.0 rather than NaN.
    divisors = np.where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column vectors to every row.
    normDataSet = (dataSet - minVals) / divisors
    return normDataSet, ranges, minVals


def plotData(dataMat=None, labels=None):
    """Show a scatter plot of column 1 against column 2 of the samples.

    Marker size and marker colour are both set to ``15.0 * label`` so that
    the classes can be told apart in the plot.

    Args:
        dataMat: 2-D array of samples; columns 1 and 2 are plotted. When
            omitted, the module-level ``datingDataMat`` is used.
        labels: Numeric class label of every row of ``dataMat``. When
            omitted, the module-level ``datingLabels`` is used.

    Raises:
        NameError: If an argument is omitted and the corresponding
            module-level variable has not been defined.
    """
    # Fall back to the globals created by the script entry point so that
    # the original zero-argument call keeps working.
    if dataMat is None:
        dataMat = datingDataMat
    if labels is None:
        labels = datingLabels
    markerValues = 15.0 * np.array(labels)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    # [:, 1] selects column 1 of every row, [:, 2] selects column 2.
    ax.scatter(dataMat[:, 1], dataMat[:, 2], markerValues, markerValues)
    plt.show()


def datingClassTest(hoRatio=0.10, filename='datingTestSet2.txt', k=3):
    """Measure the classifier's error rate on a hold-out part of the data.

    The first ``hoRatio`` share of the normalised samples is used as test
    data and the remaining samples as training data. Every prediction and
    the final error rate are printed.

    Args:
        hoRatio: Fraction of the samples held out for testing.
        filename: Tab-separated data file passed to ``file2matrix``.
        k: Number of neighbours passed to ``classify0``.

    Raises:
        ValueError: If the split leaves the test set or the training set
            empty (previously an empty test set ended in a
            ``ZeroDivisionError`` when the rate was computed).
    """
    datingDataMat, datingLabels = file2matrix(filename)
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    if not 0 < numTestVecs < m:
        raise ValueError(
            "hoRatio=%r leaves %d test and %d training samples; both must be non-empty"
            % (hoRatio, numTestVecs, m - numTestVecs)
        )
    # The training slices do not change between iterations, so build them once.
    trainMat = normMat[numTestVecs:m, :]
    trainLabels = datingLabels[numTestVecs:m]
    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = myKnn.classify0(normMat[i, :], trainMat, trainLabels, k)
        print("the classifier came back with:%d, the real anser is:%d"%(classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print("the total error rate is:%f"%(errorCount / float(numTestVecs)))


def classifyPerson():
    """Ask for one person's three features and print the predicted liking.

    The answers are read from standard input, scaled with the minimum and
    range of the training data, and classified with the three nearest
    neighbours from ``datingTestSet2.txt``.
    """
    prompts = (
        "percentage of time spent playing video games?",
        "frequent flier miles earned per year?",
        "liters of ice cream consumed per year?",
    )
    gameShare, flierMiles, iceCreamLitres = [float(input(text)) for text in prompts]

    features, classLabels = file2matrix('datingTestSet2.txt')
    scaled, spans, lows = autoNorm(features)
    # The query is built as (miles, game share, ice cream), not in prompt order.
    query = np.array([flierMiles, gameShare, iceCreamLitres])
    verdict = myKnn.classify0((query - lows) / spans, scaled, classLabels, 3)

    # The predicted label minus one is used as the index into this list.
    likings = ['not at all', 'in small doses', 'in large doses']
    print("You will probably like this person: %s" % likings[verdict - 1])


if __name__ == "__main__":
    # Load the data into module-level names first: plotData() is called
    # without arguments and picks up datingDataMat and datingLabels from
    # module scope.
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    plotData()
    # These normalised values are not used below; classifyPerson() loads and
    # normalises the data file again on its own.
    normMat, ranges, minVals = autoNorm(datingDataMat)
    classifyPerson()
最后修改:2021 年 02 月 25 日
如果觉得我的文章对你有用,请随意赞赏