The k-nearest neighbor (kNN) method was proposed by Cover and Hart in 1968.
Input: a training dataset $T = \{(x_1, y_1), (x_2, y_2), \dots, (x_N, y_N)\}$, where each feature vector $x_i \in \mathcal{X} \subseteq \mathbf{R}^n$ and its class $y_i \in \mathcal{Y} = \{c_1, c_2, \dots, c_K\}$; and a given instance feature vector $x$.
Output: the class $y$ that instance $x$ belongs to.
(1) According to the given distance metric, find the $k$ points in $T$ nearest to $x$; the neighborhood of $x$ covering these $k$ points is denoted $N_k(x)$.
(2) In $N_k(x)$, decide the class $y$ of $x$ according to a classification decision rule (e.g., majority voting):
$$y = \arg\max_{c_j} \sum_{x_i \in N_k(x)} I(y_i = c_j), \quad i = 1, 2, \dots, N; \; j = 1, 2, \dots, K$$
where $I$ is the indicator function: $I = 1$ when $y_i = c_j$, and $I = 0$ otherwise.
When $k = 1$ we get a special case called the nearest neighbor algorithm: the input is assigned the class of the single training point closest to it.
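To make the procedure concrete, here is a minimal brute-force sketch of the algorithm above (the function name knn_predict and the sort-based neighbor search are my own illustration; the post's full implementation appears later):

import math
from collections import Counter

def knn_predict(T, x, k=3):
    # T: list of (feature_vector, label) pairs; x: the query feature vector
    dist = lambda a, b: math.sqrt(sum((ai - bi) ** 2 for ai, bi in zip(a, b)))
    # the k training points nearest to x, i.e. N_k(x)
    neighbors = sorted(T, key=lambda pair: dist(pair[0], x))[:k]
    # majority voting over the labels in N_k(x)
    return Counter(label for _, label in neighbors).most_common(1)[0][0]

T = [([1, 1], 'A'), ([1, 2], 'A'), ([5, 5], 'B'), ([6, 5], 'B')]
print(knn_predict(T, [2, 2], k=3))  # -> 'A'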
Three elements: the choice of k, the distance metric, and the classification decision rule.
Once these three elements of the k-nearest neighbor model are fixed, the class of any new input instance is uniquely determined.
The distance between two points in feature space reflects how similar the two instances are.
Distance: suppose the feature space $\mathcal{X}$ is $n$-dimensional, $x_i, x_j \in \mathcal{X}$, with $x_i = (x_i^{(1)}, x_i^{(2)}, \dots, x_i^{(n)})^T$. The $L_p$ distance is defined as
$$L_p(x_i, x_j) = \left( \sum_{l=1}^{n} |x_i^{(l)} - x_j^{(l)}|^p \right)^{1/p}, \quad p \ge 1$$
When $p = 2$, it is the Euclidean distance: $L_2(x_i, x_j) = \left( \sum_{l=1}^{n} |x_i^{(l)} - x_j^{(l)}|^2 \right)^{1/2}$
When $p = 1$, it is the Manhattan distance: $L_1(x_i, x_j) = \sum_{l=1}^{n} |x_i^{(l)} - x_j^{(l)}|$
When $p = \infty$, it is the maximum of the coordinate distances: $L_\infty(x_i, x_j) = \max_{l} |x_i^{(l)} - x_j^{(l)}|$
import math

def L_p(xi, xj, p=2):
    # Minkowski (L_p) distance between two equal-length vectors
    if len(xi) == len(xj) and len(xi) > 0:
        total = 0
        for i in range(len(xi)):
            total += math.pow(abs(xi[i] - xj[i]), p)
        return math.pow(total, 1 / p)
    else:
        return 0

x1 = [1, 1]
x2 = [5, 1]
x3 = [4, 4]
X = [x1, x2, x3]
for i in range(len(X)):
    for j in range(i + 1, len(X)):
        for p in range(1, 5):
            print("L%d distance between x%d and x%d: %.2f" % (p, i + 1, j + 1, L_p(X[i], X[j], p)))
L1 distance between x1 and x2: 4.00
L2 distance between x1 and x2: 4.00
L3 distance between x1 and x2: 4.00
L4 distance between x1 and x2: 4.00
L1 distance between x1 and x3: 6.00
L2 distance between x1 and x3: 4.24
L3 distance between x1 and x3: 3.78
L4 distance between x1 and x3: 3.57
L1 distance between x2 and x3: 4.00
L2 distance between x2 and x3: 3.16
L3 distance between x2 and x3: 3.04
L4 distance between x2 and x3: 3.01
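As a cross-check (my addition, not part of the original computation), the same values can be obtained with NumPy's norm, whose ord parameter accepts any p >= 1 for 1-D inputs:

import numpy as np

for a, b, name in [(x1, x2, "x1,x2"), (x1, x3, "x1,x3"), (x2, x3, "x2,x3")]:
    # np.linalg.norm(v, ord=p) computes (sum |v_l|^p)^(1/p) for a 1-D array v
    print(name, [round(float(np.linalg.norm(np.subtract(a, b), ord=p)), 2) for p in range(1, 5)])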
Choice of the value of k

A small k means only the closest points vote, so the model is sensitive to noise (overfitting); a large k lets distant, less similar points influence the prediction (underfitting). In practice k is usually selected by cross-validation, as sketched below.
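A minimal sketch of choosing k by cross-validation (my own illustration, using scikit-learn, which this post also uses later):

from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

X, y = load_iris(return_X_y=True)
# mean 5-fold cross-validation accuracy for each candidate k
scores = {k: cross_val_score(KNeighborsClassifier(n_neighbors=k), X, y, cv=5).mean()
          for k in range(1, 31)}
print("best k:", max(scores, key=scores.get))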
Classification decision rule: majority voting (majority voting rule)
Assume the loss function is the 0-1 loss. For the neighborhood $N_k(x)$ of $x$, if its classification is $c_j$, the misclassification rate is
$$\frac{1}{k} \sum_{x_i \in N_k(x)} I(y_i \ne c_j) = 1 - \frac{1}{k} \sum_{x_i \in N_k(x)} I(y_i = c_j)$$
To minimize the misclassification rate, maximize $\sum_{x_i \in N_k(x)} I(y_i = c_j)$, i.e., pick the class held by the majority of the neighbors (so majority voting is equivalent to empirical risk minimization).
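A tiny numeric check of this rule (the neighbor labels here are made up for illustration):

from collections import Counter

neighbor_labels = ['c1', 'c1', 'c2', 'c1', 'c2']  # labels inside N_k(x), k = 5
k = len(neighbor_labels)
for c, count in Counter(neighbor_labels).items():
    # misclassification rate if we predict class c: 1 - (1/k) * sum(I(y_i == c))
    print(c, "error rate:", 1 - count / k)
# c1: 0.4, c2: 0.6 -> predicting the majority class c1 minimizes the error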
The simplest implementation, a linear scan that computes the distance from the input to every training point, is inefficient when the training set is large and therefore impractical; the kd-tree below speeds up the search.
kd-tree

A kd-tree is a tree data structure that stores instance points in k-dimensional space for fast retrieval. It is a binary tree representing a partition of the k-dimensional space: the space is repeatedly split by hyperplanes perpendicular to the coordinate axes, producing a series of k-dimensional hyperrectangular regions, and each node of the tree corresponds to one such hyperrectangle.

Constructing a kd-tree: cycle through the coordinate axes, at each node splitting the remaining points at the median along the current axis (which keeps the tree balanced):
class KdNode:
    def __init__(self, dom_elt, split, left, right):
        self.dom_elt = dom_elt  # a k-dimensional vector (a sample point in k-dimensional space)
        self.split = split      # integer: index of the dimension used for splitting
        self.left = left        # kd-tree formed by the subspace left of the splitting hyperplane
        self.right = right      # kd-tree formed by the subspace right of the splitting hyperplane

class KdTree:
    def __init__(self, data):
        k = len(data[0])  # dimensionality of the instances

        def CreateNode(split, data_set):
            if not data_set:
                return None
            data_set.sort(key=lambda x: x[split])
            split_pos = len(data_set) // 2  # integer division: position of the median
            median = data_set[split_pos]
            split_next = (split + 1) % k  # cycle to the next splitting dimension
            return KdNode(median, split,
                          CreateNode(split_next, data_set[:split_pos]),
                          CreateNode(split_next, data_set[split_pos + 1:]))

        self.root = CreateNode(0, data)

    def preorder(self, root):
        if root:
            print(root.dom_elt)
            if root.left:
                self.preorder(root.left)
            if root.right:
                self.preorder(root.right)

data = [[2, 3], [5, 4], [9, 6], [4, 7], [8, 1], [7, 2]]
kd = KdTree(data)
kd.preorder(kd.root)
Output (preorder traversal; the root is [7, 2] because sorting the six points by their first coordinate puts [7, 2] at the median position len(data) // 2 = 3):
[7, 2]
[5, 4]
[2, 3]
[4, 7]
[9, 6]
[8, 1]
Searching the kd-tree

Given a target point, search for its nearest neighbor: first descend to the leaf region containing the target and take that point as the current nearest; then backtrack, examining a sibling subtree only when the hypersphere centered at the target with radius equal to the current nearest distance intersects that subtree's splitting hyperplane. For randomly distributed data the average search cost is O(log N).
import numpy as np
from collections import namedtuple

# a namedtuple holding the nearest point, the nearest distance, and the number of nodes visited
result = namedtuple("Result_tuple",
                    "nearest_point nearest_dist nodes_visited")

def find_nearest(tree, point):
    k = len(point)  # data dimensionality

    def travel(kd_node, target, max_dist):
        if kd_node is None:
            return result([0] * k, float("inf"), 0)
            # float("inf") / float("-inf") are Python's positive / negative infinity

        nodes_visited = 1
        s = kd_node.split        # splitting dimension
        pivot = kd_node.dom_elt  # the splitting point ("pivot")

        if target[s] <= pivot[s]:  # target's s-th coordinate <= pivot's: target is closer to the left subtree
            nearer_node = kd_node.left    # visit the left subtree first
            further_node = kd_node.right  # remember the right subtree for later
        else:  # the target is closer to the right subtree
            nearer_node = kd_node.right   # visit the right subtree first
            further_node = kd_node.left

        temp1 = travel(nearer_node, target, max_dist)  # recurse down to the region containing the target
        nearest = temp1.nearest_point  # take this leaf as the "current nearest point"
        dist = temp1.nearest_dist      # current nearest distance
        nodes_visited += temp1.nodes_visited

        if dist < max_dist:
            max_dist = dist  # the nearest point lies in a hypersphere centered at the target with radius max_dist

        temp_dist = abs(pivot[s] - target[s])  # distance from the target to the splitting hyperplane along dim s
        if max_dist < temp_dist:  # does the hypersphere intersect the splitting hyperplane?
            return result(nearest, dist, nodes_visited)  # no intersection: the other side cannot be closer

        # ----------------------------------------------------------------------
        # compute the Euclidean distance between the target and the pivot
        p = np.array(pivot)
        t = np.array(target)
        temp_dist = np.linalg.norm(p - t)
        if temp_dist < dist:   # the pivot is closer
            nearest = pivot    # update the nearest point
            dist = temp_dist   # update the nearest distance
            max_dist = dist    # shrink the hypersphere radius

        # check whether the region of the other child contains a closer point
        temp2 = travel(further_node, target, max_dist)
        nodes_visited += temp2.nodes_visited
        if temp2.nearest_dist < dist:  # the other subtree holds a closer point
            nearest = temp2.nearest_point
            dist = temp2.nearest_dist
        return result(nearest, dist, nodes_visited)

    return travel(tree.root, point, float("inf"))  # recurse from the root
from time import time
from random import random

def random_point(k):
    return [random() for _ in range(k)]

def random_points(k, n):
    return [random_point(k) for _ in range(n)]

ret = find_nearest(kd, [3, 4.5])
print(ret)

N = 400000
t0 = time()
kd2 = KdTree(random_points(3, N))  # 400,000 random 3-d points with coordinates in [0, 1)
ret2 = find_nearest(kd2, [0.1, 0.5, 0.8])
t1 = time()
print("time: ", t1 - t0, " s")
print(ret2)
Output: with 400,000 points, building the kd-tree and finding the nearest neighbor together take only about 4 s, and the search itself visits just 49 nodes:
Result_tuple(nearest_point=[2, 3], nearest_dist=1.8027756377319946, nodes_visited=4)
time: 4.314465284347534 s
Result_tuple(nearest_point=[0.10186986970329936, 0.5007753108096316, 0.7998708312483109], nearest_dist=0.002028350099282986, nodes_visited=49)
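For comparison (my addition, assuming SciPy is installed), SciPy's C-implemented KD-tree handles the same workload, typically much faster than the pure-Python version above:

from scipy.spatial import cKDTree
import numpy as np

pts = np.random.rand(400000, 3)          # 400,000 random 3-d points
tree = cKDTree(pts)                      # build the KD-tree
dist, idx = tree.query([0.1, 0.5, 0.8])  # nearest-neighbor query
print(pts[idx], dist)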
# -*- coding:utf-8 -*-
# @Python Version: 3.7
# @Time: 2020/3/2 22:44
# @Author: Michael Ming
# @Website: https://michael.blog.csdn.net/
# @File: 3.KNearestNeighbors.py
# @Reference: https://github.com/fengdu78/lihang-code
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from collections import Counter

class KNearNeighbors:
    def __init__(self, X_train, y_train, neighbors=3, p=2):
        self.n = neighbors
        self.p = p
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, X):
        knn_list = []
        # seed knn_list with the first n training points and their distances to X
        for i in range(self.n):
            dist = np.linalg.norm(X - self.X_train[i], ord=self.p)
            knn_list.append((dist, self.y_train[i]))
        # scan the remaining training points; whenever one is closer,
        # replace the current farthest entry in knn_list
        for i in range(self.n, len(self.X_train)):
            max_index = knn_list.index(max(knn_list, key=lambda x: x[0]))
            dist = np.linalg.norm(X - self.X_train[i], ord=self.p)
            if knn_list[max_index][0] > dist:
                knn_list[max_index] = (dist, self.y_train[i])
        # collect the labels of the n nearest neighbors
        knn = [k[-1] for k in knn_list]
        count_pairs = Counter(knn)
        # the most frequent label: sort the (label, count) pairs by count and take the last
        max_count = sorted(count_pairs.items(), key=lambda x: x[1])[-1][0]
        return max_count

    def score(self, X_test, y_test):
        right_count = 0
        for X, y in zip(X_test, y_test):  # zip iterates over several sequences in parallel
            label = self.predict(X)
            if math.isclose(label, y, rel_tol=1e-5):  # floating-point equality test
                right_count += 1
        print("Accuracy: %.4f" % (right_count / len(X_test)))
        return right_count / len(X_test)

if __name__ == '__main__':
    # --------- KNN on the iris dataset ----------------
    iris = load_iris()
    df = pd.DataFrame(iris.data, columns=iris.feature_names)
    df['label'] = iris.target
    plt.scatter(df[:50][iris.feature_names[0]], df[:50][iris.feature_names[1]], label=iris.target_names[0])
    plt.scatter(df[50:100][iris.feature_names[0]], df[50:100][iris.feature_names[1]], label=iris.target_names[1])
    plt.xlabel(iris.feature_names[0])
    plt.ylabel(iris.feature_names[1])
    data = np.array(df.iloc[:100, [0, 1, -1]])  # first two classes, first two features (plus the label)
    X, y = data[:, :-1], data[:, -1]
    # split the data, holding out 20% for testing
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    # KNN with 20 neighbors and the L2 distance
    clf = KNearNeighbors(X_train, y_train, 20, 2)
    # predict on the test set and report the accuracy
    clf.score(X_test, y_test)
    # classify an arbitrary point with KNN
    test_point = [4.75, 2.75]
    test_point_flower = 'test point ' + iris.target_names[int(clf.predict(test_point))]
    print("The test point's class is: %s" % test_point_flower)
    plt.plot(test_point[0], test_point[1], 'bx', label=test_point_flower)
    plt.rcParams['font.sans-serif'] = 'SimHei'  # font that renders CJK characters in plot labels
    plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly with that font
    plt.legend()
    plt.show()
Accuracy: 1.0000
The test point's class is: test point setosa
sklearn.neighbors.KNeighborsClassifier

The constructor signature (n_neighbors is k; p and metric define the Minkowski distance; algorithm chooses between brute force, kd-tree, and ball tree):

class sklearn.neighbors.KNeighborsClassifier(n_neighbors=5, weights='uniform',
                                             algorithm='auto', leaf_size=30, p=2, metric='minkowski',
                                             metric_params=None, n_jobs=None, **kwargs)
import time
from sklearn.neighbors import KNeighborsClassifier

clf_skl = KNeighborsClassifier(n_neighbors=50, p=4, algorithm='kd_tree')
start = time.time()
total_score = 0
for i in range(100):
    clf_skl.fit(X_train, y_train)
    total_score += clf_skl.score(X_test, y_test)
end = time.time()
print("Average accuracy: %.4f" % (total_score / 100))
print("Time taken: %.4f ms" % (1000 * (end - start) / 100))
The complete code:

# -*- coding:utf-8 -*-
# @Python Version: 3.7
# @Time: 2020/3/2 22:44
# @Author: Michael Ming
# @Website: https://michael.blog.csdn.net/
# @File: 3.KNearestNeighbors.py
# @Reference: https://github.com/fengdu78/lihang-code
import math
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from random import random
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from collections import Counter, namedtuple

def L_p(xi, xj, p=2):
    # Minkowski (L_p) distance between two equal-length vectors
    if len(xi) == len(xj) and len(xi) > 0:
        total = 0
        for i in range(len(xi)):
            total += math.pow(abs(xi[i] - xj[i]), p)
        return math.pow(total, 1 / p)
    else:
        return 0

class KNearNeighbors:
    def __init__(self, X_train, y_train, neighbors=3, p=2):
        self.n = neighbors
        self.p = p
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, X):
        knn_list = []
        # seed knn_list with the first n training points and their distances to X
        for i in range(self.n):
            dist = np.linalg.norm(X - self.X_train[i], ord=self.p)
            knn_list.append((dist, self.y_train[i]))
        # scan the remaining training points; whenever one is closer,
        # replace the current farthest entry in knn_list
        for i in range(self.n, len(self.X_train)):
            max_index = knn_list.index(max(knn_list, key=lambda x: x[0]))
            dist = np.linalg.norm(X - self.X_train[i], ord=self.p)
            if knn_list[max_index][0] > dist:
                knn_list[max_index] = (dist, self.y_train[i])
        # collect the labels of the n nearest neighbors
        knn = [k[-1] for k in knn_list]
        count_pairs = Counter(knn)
        # the most frequent label: sort the (label, count) pairs by count and take the last
        max_count = sorted(count_pairs.items(), key=lambda x: x[1])[-1][0]
        return max_count

    def score(self, X_test, y_test):
        right_count = 0
        for X, y in zip(X_test, y_test):  # zip iterates over several sequences in parallel
            label = self.predict(X)
            if math.isclose(label, y, rel_tol=1e-5):  # floating-point equality test
                right_count += 1
        print("Accuracy: %.4f" % (right_count / len(X_test)))
        return right_count / len(X_test)

class KdNode:
    def __init__(self, dom_elt, split, left, right):
        self.dom_elt = dom_elt  # a k-dimensional vector (a sample point in k-dimensional space)
        self.split = split      # integer: index of the dimension used for splitting
        self.left = left        # kd-tree formed by the subspace left of the splitting hyperplane
        self.right = right      # kd-tree formed by the subspace right of the splitting hyperplane

class KdTree:
    def __init__(self, data):
        k = len(data[0])  # dimensionality of the instances

        def CreateNode(split, data_set):
            if not data_set:
                return None
            data_set.sort(key=lambda x: x[split])
            split_pos = len(data_set) // 2  # integer division: position of the median
            median = data_set[split_pos]
            split_next = (split + 1) % k  # cycle to the next splitting dimension
            return KdNode(median, split,
                          CreateNode(split_next, data_set[:split_pos]),
                          CreateNode(split_next, data_set[split_pos + 1:]))

        self.root = CreateNode(0, data)

    def preorder(self, root):
        if root:
            print(root.dom_elt)
            if root.left:
                self.preorder(root.left)
            if root.right:
                self.preorder(root.right)

# a namedtuple holding the nearest point, the nearest distance, and the number of nodes visited
result = namedtuple("Result_tuple",
                    "nearest_point nearest_dist nodes_visited")

def find_nearest(tree, point):
    k = len(point)  # data dimensionality

    def travel(kd_node, target, max_dist):
        if kd_node is None:
            return result([0] * k, float("inf"), 0)
            # float("inf") / float("-inf") are Python's positive / negative infinity

        nodes_visited = 1
        s = kd_node.split        # splitting dimension
        pivot = kd_node.dom_elt  # the splitting point ("pivot")

        if target[s] <= pivot[s]:  # target's s-th coordinate <= pivot's: target is closer to the left subtree
            nearer_node = kd_node.left    # visit the left subtree first
            further_node = kd_node.right  # remember the right subtree for later
        else:  # the target is closer to the right subtree
            nearer_node = kd_node.right
            further_node = kd_node.left

        temp1 = travel(nearer_node, target, max_dist)  # recurse down to the region containing the target
        nearest = temp1.nearest_point  # take this leaf as the "current nearest point"
        dist = temp1.nearest_dist      # current nearest distance
        nodes_visited += temp1.nodes_visited

        if dist < max_dist:
            max_dist = dist  # the nearest point lies in a hypersphere centered at the target with radius max_dist

        temp_dist = abs(pivot[s] - target[s])  # distance from the target to the splitting hyperplane along dim s
        if max_dist < temp_dist:  # does the hypersphere intersect the splitting hyperplane?
            return result(nearest, dist, nodes_visited)  # no intersection: the other side cannot be closer

        # ----------------------------------------------------------------------
        # compute the Euclidean distance between the target and the pivot
        p = np.array(pivot)
        t = np.array(target)
        temp_dist = np.linalg.norm(p - t)
        if temp_dist < dist:   # the pivot is closer
            nearest = pivot    # update the nearest point
            dist = temp_dist   # update the nearest distance
            max_dist = dist    # shrink the hypersphere radius

        # check whether the region of the other child contains a closer point
        temp2 = travel(further_node, target, max_dist)
        nodes_visited += temp2.nodes_visited
        if temp2.nearest_dist < dist:  # the other subtree holds a closer point
            nearest = temp2.nearest_point
            dist = temp2.nearest_dist
        return result(nearest, dist, nodes_visited)

    return travel(tree.root, point, float("inf"))  # recurse from the root

if __name__ == '__main__':
    # --------- computing distances ----------------
    x1 = [1, 1]
    x2 = [5, 1]
    x3 = [4, 4]
    X = [x1, x2, x3]
    for i in range(len(X)):
        for j in range(i + 1, len(X)):
            for p in range(1, 5):
                print("L%d distance between x%d and x%d: %.2f" % (p, i + 1, j + 1, L_p(X[i], X[j], p)))
    # --------- KNN on the iris dataset ----------------
    iris = load_iris()
    df = pd.DataFrame(iris.data, columns=iris.feature_names)
    df['label'] = iris.target
    plt.scatter(df[:50][iris.feature_names[0]], df[:50][iris.feature_names[1]], label=iris.target_names[0])
    plt.scatter(df[50:100][iris.feature_names[0]], df[50:100][iris.feature_names[1]], label=iris.target_names[1])
    plt.xlabel(iris.feature_names[0])
    plt.ylabel(iris.feature_names[1])
    data = np.array(df.iloc[:100, [0, 1, -1]])  # first two classes, first two features (plus the label)
    X, y = data[:, :-1], data[:, -1]
    # split the data, holding out 20% for testing
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    # KNN with 20 neighbors and the L2 distance
    clf = KNearNeighbors(X_train, y_train, 20, 2)
    # predict on the test set and report the accuracy
    clf.score(X_test, y_test)
    # classify an arbitrary point with KNN
    test_point = [4.75, 2.75]
    test_point_flower = 'test point ' + iris.target_names[int(clf.predict(test_point))]
    print("The test point's class is: %s" % test_point_flower)
    plt.plot(test_point[0], test_point[1], 'bx', label=test_point_flower)
    plt.rcParams['font.sans-serif'] = 'SimHei'  # font that renders CJK characters in plot labels
    plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly with that font
    plt.legend()
    plt.show()
    # --------- sklearn KNN ----------
    from sklearn.neighbors import KNeighborsClassifier
    clf_skl = KNeighborsClassifier(n_neighbors=50, p=4, algorithm='kd_tree')
    start = time.time()
    total_score = 0
    for i in range(100):
        clf_skl.fit(X_train, y_train)
        total_score += clf_skl.score(X_test, y_test)
    end = time.time()
    print("Average accuracy: %.4f" % (total_score / 100))
    print("Time taken: %.4f ms" % (1000 * (end - start) / 100))
    # ------ build KD Tree --------------
    data = [[2, 3], [5, 4], [9, 6], [4, 7], [8, 1], [7, 2]]
    kd = KdTree(data)
    kd.preorder(kd.root)
    # ------ search in KD Tree -----------
    def random_point(k):
        return [random() for _ in range(k)]

    def random_points(k, n):
        return [random_point(k) for _ in range(n)]

    ret = find_nearest(kd, [3, 4.5])
    print(ret)
    N = 400000
    t0 = time.time()
    kd2 = KdTree(random_points(3, N))
    ret2 = find_nearest(kd2, [0.1, 0.5, 0.8])
    t1 = time.time()
    print("time: ", t1 - t0, " s")
    print(ret2)