前言
代码实现
import numpy as np
# 初始数据
def loadSimpData():
    """Build the tiny 2-D toy data set used to demo the AdaBoost routines.

    Returns:
        tuple: (dataMat, classLabels) where dataMat is a (5, 2) ndarray of
        feature values and classLabels is a list of +1.0 / -1.0 labels.
    """
    dataMat = np.array([[1.0, 2.1],
                        [2.0, 1.1],
                        [1.3, 1.0],
                        [1.0, 1.0],
                        [2.0, 1.0]])
    classLabels = [1.0, 1.0, -1.0, -1.0, 1.0]
    return dataMat, classLabels
def stumpClassify(dataMatrix, dimen, threshVal, threshIneq):
    """Classify every sample with a single-feature threshold test (a stump).

    Args:
        dataMatrix: matrix of samples, one row per sample.
        dimen: index of the feature (column) to threshold on.
        threshVal: the threshold value.
        threshIneq: 'lt' marks values <= threshold as -1; any other value
            marks values > threshold as -1.

    Returns:
        (m, 1) array of predicted labels, each entry +1.0 or -1.0.
    """
    # Everything defaults to +1; the matching side of the threshold is
    # flipped to -1 below.
    predictions = np.ones((np.shape(dataMatrix)[0], 1))
    column = dataMatrix[:, dimen]
    if threshIneq == 'lt':
        predictions[column <= threshVal] = -1.0
    else:
        predictions[column > threshVal] = -1.0
    return predictions
def buildStump(dataArr, classLabels, D):
    """Find the best single-level decision tree (stump) for the weighted set.

    Exhaustively scans every feature, a grid of candidate thresholds, and
    both inequality directions, keeping the stump with the lowest D-weighted
    classification error.

    Args:
        dataArr: (m, n) array-like of training samples.
        classLabels: length-m sequence of +1 / -1 labels.
        D: (m, 1) matrix of sample weights (should sum to 1).

    Returns:
        tuple: (bestStump, minError, bestClasEst) where bestStump is a dict
        with keys 'dim', 'thresh' and 'ineq'; minError is the lowest
        weighted error as a plain float (the original returned a 1x1
        matrix); bestClasEst is that stump's (m, 1) prediction vector.
    """
    dataMatrix = np.mat(dataArr)
    labelMat = np.mat(classLabels).T
    # m: number of samples, n: number of features
    m, n = np.shape(dataMatrix)
    numSteps = 10.0
    bestStump = {}
    bestClasEst = np.mat(np.zeros((m, 1)))
    # Start at +inf so the first candidate stump always becomes the best.
    minError = np.inf
    # Scan every feature.
    for i in range(n):
        rangeMin = dataMatrix[:, i].min()
        rangeMax = dataMatrix[:, i].max()
        stepSize = (rangeMax - rangeMin) / numSteps
        # j = -1 and j = numSteps put the threshold just outside the observed
        # range, so the "all +1" / "all -1" stumps are also considered.
        for j in range(-1, int(numSteps) + 1):
            for inequal in ['lt', 'gt']:
                threshVal = (rangeMin + float(j) * stepSize)
                # Classify with this candidate stump.
                predictedVals = stumpClassify(dataMatrix, i, threshVal, inequal)
                errArr = np.mat(np.ones((m, 1)))
                errArr[predictedVals == labelMat] = 0
                # Weighted error = sum of weights of misclassified samples.
                # float() unwraps the 1x1 matrix produced by D.T * errArr so
                # callers get a scalar, not an np.matrix.
                weightError = float(D.T * errArr)
                print("split: dim %d, thresh %.2f, thresh inequal: %s, the weighted error is %.3f" % (i, threshVal, inequal, weightError))
                if weightError < minError:
                    minError = weightError
                    bestClasEst = predictedVals.copy()
                    bestStump['dim'] = i
                    bestStump['thresh'] = threshVal
                    bestStump['ineq'] = inequal
    # bestStump: best stump parameters; minError: its weighted error;
    # bestClasEst: its predicted class vector.
    return bestStump, minError, bestClasEst
def adaBoostTrainDS(dataArr, classLabels, numIt=40):
    """Train an AdaBoost ensemble of decision stumps.

    Args:
        dataArr: (m, n) array-like of training samples.
        classLabels: length-m sequence of +1 / -1 labels.
        numIt: maximum number of boosting rounds.

    Returns:
        list of weak-classifier dicts ('dim', 'thresh', 'ineq', 'alpha'),
        one per completed round.
    """
    weakClassArr = []
    sampleCount = np.shape(dataArr)[0]
    # Start from uniform sample weights.
    D = np.mat(np.ones((sampleCount, 1)) / sampleCount)
    # Running alpha-weighted vote of all stumps trained so far.
    aggClassEst = np.mat(np.zeros((sampleCount, 1)))
    labelCol = np.mat(classLabels).T
    for _ in range(numIt):
        bestStump, error, classEst = buildStump(dataArr, classLabels, D)
        print("D:", D.T)
        # Classifier weight; max(error, 1e-16) guards against division by zero.
        alpha = float(0.5 * np.log((1.0 - error) / max(error, 1e-16)))
        bestStump['alpha'] = alpha
        weakClassArr.append(bestStump)
        print("classEst:", classEst.T)
        # Reweight samples for the next round: correctly classified samples
        # shrink, misclassified samples grow; then renormalize.
        expon = np.multiply(-1 * alpha * labelCol, classEst)
        D = np.multiply(D, np.exp(expon))
        D = D / D.sum()
        # Accumulate the ensemble vote and measure its training error.
        aggClassEst += alpha * classEst
        print("aggClassEst:", aggClassEst.T)
        aggErrors = np.multiply(np.sign(aggClassEst) != labelCol, np.ones((sampleCount, 1)))
        errorRate = aggErrors.sum() / sampleCount
        print("total error:", errorRate, "\n")
        # Perfect fit on the training set — stop early.
        if errorRate == 0.0:
            break
    return weakClassArr
def adaClassify(datToClass, classifierArr):
    """Classify samples with a trained AdaBoost stump ensemble.

    Args:
        datToClass: array-like of samples to classify, one row per sample.
        classifierArr: list of weak-classifier dicts as produced by
            adaBoostTrainDS.

    Returns:
        (m, 1) matrix of predicted labels, each +1.0 or -1.0.
    """
    dataMatrix = np.mat(datToClass)
    numSamples = np.shape(dataMatrix)[0]
    aggClassEst = np.mat(np.zeros((numSamples, 1)))
    # Add up each stump's alpha-weighted vote.
    for stump in classifierArr:
        vote = stumpClassify(dataMatrix, stump['dim'], stump['thresh'], stump['ineq'])
        aggClassEst += stump['alpha'] * vote
        print(aggClassEst)
    # The sign of the accumulated vote is the final prediction.
    return np.sign(aggClassEst)
if __name__ == "__main__":
    # Train an ensemble on the toy data, then classify two new points.
    datMat, classLabels = loadSimpData()
    classifierArray = adaBoostTrainDS(datMat, classLabels, 30)
    result = adaClassify([[5, 5], [0, 0]], classifierArray)
    print(result)