Hand-Rolling an "Artificial Un-intelligence" from Scratch
Preface
A neural-network "Hello World": handwritten digit recognition. Written by a beginner, so please forgive any mistakes.
Prerequisites: Python (hello-world level) and three minutes of enthusiasm.
Importing dependencies
import copy
import math
import struct  # for parsing the binary MNIST file headers
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np  # matrix operations
from tqdm import tqdm  # progress bars
Activation functions and their derivatives
An activation function reshapes a layer's values; for example, the last layer's outputs are turned into a probability for each digit.
def bypass(x):  # identity: pass the input straight through
    return x
def d_bypass(x):  # derivative of the identity is 1 everywhere
    return np.ones_like(x)
def relu(x):
    return np.maximum(0, x)
def d_relu(x):
    return np.where(x > 0, 1, 0)
def softmax(x):  # map values to probabilities in (0, 1) that sum to 1
    exp = np.exp(x - x.max())
    return exp / exp.sum()
def d_softmax(data):  # derivative (Jacobian) of softmax
    sm = softmax(data)
    # diag: diagonal matrix; outer: outer product
    return np.diag(sm) - np.outer(sm, sm)
fd = {bypass: d_bypass, relu: d_relu, softmax: d_softmax}  # derivative lookup table
d_type = {bypass: 'times', relu: 'times', softmax: 'dot'}  # how to apply the derivative: element-wise multiply ('times') or matrix product ('dot')
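d_softmax returns a full Jacobian matrix while the element-wise derivatives return a vector, which is easy to mix up. A quick sanity check, as a minimal sketch (the vector length 5 and the tolerance are arbitrary choices): compare the analytic Jacobian against a finite-difference estimate.

x = np.random.rand(5)
eps = 1e-6
numeric = np.zeros((5, 5))
for i in range(5):
    dx = np.zeros(5)
    dx[i] = eps
    numeric[i] = (softmax(x + dx) - softmax(x - dx)) / (2 * eps)  # numerical Jacobian, one row per input
print(np.allclose(d_softmax(x), numeric, atol=1e-6))  # expect True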
Hyperparameters and variables
How well a network performs depends to a large extent on how these values are tuned.
dimensions = [28 * 28, 90, 10]  # neurons per layer; everything except the first and last layer can be changed
activation = [bypass, relu, softmax]  # activation function of each layer
distribution = [  # initial ranges for the parameters
    {},  # layer 0 has no parameters
    {'b': [0, 0],  # w is drawn from ±√(6/(784+90)), a distribution that works fairly well; (-1, 1) etc. would also do
     'w': [-math.sqrt(6 / (dimensions[0] + dimensions[1])), math.sqrt(6 / (dimensions[0] + dimensions[1]))]},
    {'b': [0, 0],
     'w': [-math.sqrt(6 / (dimensions[1] + dimensions[2])), math.sqrt(6 / (dimensions[1] + dimensions[2]))]}
]
learnRate = 10 ** -0.35  # learning rate
epochNum = 6  # number of training epochs
oneHot = np.identity(dimensions[-1])  # 10x10 identity matrix, used for one-hot labels
result = {}  # the "elixir" (the parameters we are refining)
batchSize = 100  # samples per mini-batch; one gradient-descent step per batch
train_num = 50000  # training set (practice problems)
valid_num = 10000  # validation set (quizzes)
test_num = 10000  # test set (final exam)
# training logs
train_loss_list = []
train_accu_list = []
valid_loss_list = []
valid_accu_list = []
Initialization and prediction
def init_parameters_b(layer):
    dist = distribution[layer]['b']  # range, e.g. [0, 0]
    # random vector of this layer's size, scaled into the given range
    return np.random.rand(dimensions[layer]) * (dist[1] - dist[0]) + dist[0]
def init_parameters_w(layer):
    dist = distribution[layer]['w']  # w's shape is set by the previous and current layer sizes
    return np.random.rand(dimensions[layer - 1], dimensions[layer]) * (dist[1] - dist[0]) + dist[0]
def init_parameters():  # initialize all parameters
    parameter = []  # [{}, {w: [784, 90], b: [90]}, {w: [90, 10], b: [10]}]
    for i in range(len(distribution)):
        parameter_item = {}
        for k in distribution[i].keys():
            if k == 'b':
                parameter_item['b'] = init_parameters_b(i)
            elif k == 'w':
                parameter_item['w'] = init_parameters_w(i)
        parameter.append(parameter_item)
    return parameter
result = init_parameters()  # initialize the "elixir"
def predict(img, parameters):  # forward pass: a = σ(a_prev · w + b)
    l_in = img
    l_out = activation[0](l_in)
    for layer in range(1, len(dimensions)):
        l_in = np.dot(l_out, parameters[layer]['w']) + parameters[layer]['b']  # [90,] = [784,]·[784,90] + [90,]
        l_out = activation[layer](l_in)
    return l_out  # [10,]
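A quick check of the freshly initialized network is useful here. A minimal sketch (the random input is made up just to confirm shapes; with softmax as the last activation, the 10 outputs should sum to 1):

dummy = np.random.rand(28 * 28)  # fake "image", illustrative only
out = predict(dummy, result)
print(out.shape, out.sum())  # (10,) 1.0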
Loading the training data
MNIST data
There are four files in total, and they need to be decompressed first.
dataset_path = Path('./MNIST')
train_img_path = dataset_path / 'train-images.idx3-ubyte'
train_lab_path = dataset_path / 'train-labels.idx1-ubyte'
test_img_path = dataset_path / 't10k-images.idx3-ubyte'
test_lab_path = dataset_path / 't10k-labels.idx1-ubyte'
with open(train_img_path, 'rb') as f:
    struct.unpack('>4i', f.read(16))  # skip the 16-byte header (magic, count, rows, cols)
    tmp_img = np.fromfile(f, dtype=np.uint8).reshape(-1, 28 * 28) / 255  # normalize pixels to [0, 1]
    train_img = tmp_img[:train_num]
    valid_img = tmp_img[train_num:]
with open(train_lab_path, 'rb') as f:
    struct.unpack('>2i', f.read(8))  # skip the 8-byte header (magic, count)
    tmp_lab = np.fromfile(f, dtype=np.uint8)
    train_lab = tmp_lab[:train_num]
    valid_lab = tmp_lab[train_num:]
with open(test_img_path, 'rb') as f:
    struct.unpack('>4i', f.read(16))
    test_img = np.fromfile(f, dtype=np.uint8).reshape(-1, 28 * 28) / 255
with open(test_lab_path, 'rb') as f:
    struct.unpack('>2i', f.read(8))
    test_lab = np.fromfile(f, dtype=np.uint8)
def show_train(index):  # view a training image
    plt.imshow(train_img[index].reshape(28, 28), cmap='gray')
    plt.show()
    print('label: {}'.format(train_lab[index]))
def show_valid(index):
    plt.imshow(valid_img[index].reshape(28, 28), cmap='gray')
    plt.show()
    print('label: {}'.format(valid_lab[index]))
def show_test(index):
    plt.imshow(test_img[index].reshape(28, 28), cmap='gray')
    plt.show()
    print('label: {}'.format(test_lab[index]))
# show_train(np.random.randint(train_num))
# show_valid(np.random.randint(valid_num))
# show_test(np.random.randint(test_num))
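A quick shape check catches a wrong path or a file that was not decompressed. A minimal sketch:

print(train_img.shape, train_lab.shape)  # (50000, 784) (50000,)
print(valid_img.shape, valid_lab.shape)  # (10000, 784) (10000,)
print(test_img.shape, test_lab.shape)    # (10000, 784) (10000,)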
Loss and accuracy
The loss (also called the cost; here it is the squared error) measures how far a prediction deviates from the correct answer; the smaller, the better.
def sqr_loss(img, lab, parameters):  # squared error
    y_pred = predict(img, parameters)  # [0.1, 0.1, 0.1, ...]
    y = oneHot[lab]  # e.g. lab = 0 gives [1, 0, 0, ...]
    diff = y - y_pred  # per-digit probability difference
    return np.dot(diff, diff)
def train_loss(parameters):  # loss over the training set
    loss_accu = 0
    for img_i in range(train_num):
        loss_accu += sqr_loss(train_img[img_i], train_lab[img_i], parameters)
    return loss_accu / (train_num / 10000)  # normalize so it is comparable with the validation set (10000 samples)
def valid_loss(parameters):
    loss_accu = 0
    for img_i in range(valid_num):
        loss_accu += sqr_loss(valid_img[img_i], valid_lab[img_i], parameters)
    return loss_accu / (valid_num / 10000)
def train_accuracy(parameters):  # training-set accuracy
    correct = [predict(train_img[img_i], parameters).argmax() == train_lab[img_i] for img_i in range(train_num)]
    return correct.count(True) / len(correct)
def valid_accuracy(parameters):
    correct = [predict(valid_img[img_i], parameters).argmax() == valid_lab[img_i] for img_i in range(valid_num)]
    return correct.count(True) / len(correct)
def test_accuracy(parameters):  # test-set accuracy
    correct = [predict(test_img[img_i], parameters).argmax() == test_lab[img_i] for img_i in range(test_num)]
    return correct.count(True) / len(correct)
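Before any training, the randomly initialized network should guess at roughly chance level; with 10 classes that is about 0.1. A small sanity check (the exact number varies from run to run):

print(valid_accuracy(result))  # roughly 0.1 for random parameters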
Computing the gradient
To make the predictions more accurate we need to make the loss smaller. The loss is a function of the parameters, so we use its derivatives to move toward a minimum.
Deriving the formulas
Start from a simple formula y = w*a + b,
where w and b are parameters, a is the input, and y is the output.
A neuron's formula is `y = σ(aw + b)`,
where w, b, a, y are matrices and σ is the activation function (which adds non-linearity). The formula is just as simple as the one above.
( ) --> ( ) --> (.48) --> (.66)   target value y = 1
Start from this tiny network and look at the last two neurons. The last layer has aL = 0.66 and the layer before it has a(L-1) = 0.48. By adjusting the parameters we want to push the last neuron's value toward the target.
The dependencies look like this (L denotes layer L, L-1 the layer before it):
w(L-1) \               wL \      y \
a(L-2) - z(L-1) - a(L-1) - zL - aL - C
b(L-1) /               bL /
Cost: C = (a^L - y)^2 \\
a^L = \sigma(a^{L-1} w^L + b^L) \\
Let\ z^L = a^{L-1} w^L + b^L \\
so\ a^L = \sigma(z^L)
w and b are the quantities we adjust.\
To shrink C we need its gradient, e.g. ∂C/∂wL, so we differentiate.\
From the diagram above, the chain rule gives:
{\partial C\over \partial w^L} = {\partial z^L\over \partial w^L}{\partial a^L\over \partial z^L}{\partial C\over \partial a^L}
{\partial C\over \partial w^L} = {\partial (a^{L-1}w^L+b^L)\over \partial w^L}\,{\partial \sigma(z^L)\over \partial z^L}\,{\partial (a^L-y)^2\over \partial a^L}
which gives: {\partial C\over \partial w^L} = a^{L-1}\,\sigma'(z^L)\,2(a^L-y)
Similarly: {\partial C\over \partial b^L} = 1\cdot\sigma'(z^L)\,2(a^L-y)
In the same way:
{\partial C\over \partial a^{L-1}} = w^L\,\sigma'(z^L)\,{\partial C\over \partial a^L}
and applying this one layer further up: {\partial C\over \partial a^L} = w^{L+1}\,\sigma'(z^{L+1})\,{\partial C\over \partial a^{L+1}}
So:
{\partial C\over \partial w^L} = a^{L-1}\,\sigma'(z^L)\,{\partial C\over \partial a^L}
that is: {\partial C\over \partial w^L} = a^{L-1}\,\sigma'(z^L)\left(w^{+1L}\,\sigma'(z^{L+1})\,{\partial C\over \partial a^{L+1}}\right)
From the last line, the ∂C/∂a factor in one layer's gradient can be obtained from the layer after it. For the final layer, ∂C/∂a^L = 2(a^L − y); plug that into the layer before it and keep chaining backwards, and the gradients of the loss with respect to every layer's w and b come out. For a batch of samples, the per-sample gradients are then accumulated and averaged.
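Plugging the tiny 0.48/0.66 example from above into the last-layer factor makes the numbers concrete (σ'(z^L) is left symbolic, since its value depends on the activation used):

{\partial C\over \partial a^L} = 2(a^L - y) = 2(0.66 - 1) = -0.68,\qquad {\partial C\over \partial w^L} = a^{L-1}\,\sigma'(z^L)\cdot(-0.68) = 0.48\cdot\sigma'(z^L)\cdot(-0.68)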
Implementing it as a function
def grad_parameters(img, lab, parameters):
    l_in_list = [img]  # pre-activation values z of every layer
    l_out_list = [activation[0](img)]  # activations a = σ(z) of every layer
    for layer in range(1, len(dimensions)):
        l_in = np.dot(l_out_list[layer - 1], parameters[layer]['w']) + parameters[layer]['b']  # z = a·w + b
        l_out = activation[layer](l_in)  # a = σ(z)
        l_in_list.append(l_in)
        l_out_list.append(l_out)
    grad_result = [{} for _ in dimensions]  # one dict per layer
    # compute dC/dw = (a(L-1))(σ'(z))(2(a-y)) and dC/db = (σ'(z))(2(a-y))
    d_layer = 2 * (l_out_list[-1] - oneHot[lab])  # dC/da = 2(a - y)
    for layer in range(len(dimensions) - 1, 0, -1):  # backpropagate from the last layer down to layer 1
        if d_type[activation[layer]] == 'times':
            d_layer = fd[activation[layer]](l_in_list[layer]) * d_layer  # da/dz = σ'(z), element-wise
        elif d_type[activation[layer]] == 'dot':
            d_layer = np.dot(fd[activation[layer]](l_in_list[layer]), d_layer)  # da/dz is a Jacobian, matrix product
        grad_result[layer] = {}
        grad_result[layer]['w'] = np.outer(l_out_list[layer - 1], d_layer)  # dC/dw
        grad_result[layer]['b'] = d_layer  # dC/db
        d_layer = np.dot(parameters[layer]['w'], d_layer)  # dC/da for the previous layer: dC/da(L-1) = w·(σ'(z)·dC/da)
    return grad_result
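It is easy to get a sign or a transpose wrong here, so a finite-difference check on a single weight is worth the effort. A minimal sketch (the weight index [406][3] is an arbitrary choice; pixel 406 is near the centre of the image, so its gradient is usually non-zero):

img0, lab0 = train_img[0], train_lab[0]
analytic = grad_parameters(img0, lab0, result)[1]['w'][406][3]
eps = 1e-5
p_plus = copy.deepcopy(result)
p_plus[1]['w'][406][3] += eps
p_minus = copy.deepcopy(result)
p_minus[1]['w'][406][3] -= eps
numeric = (sqr_loss(img0, lab0, p_plus) - sqr_loss(img0, lab0, p_minus)) / (2 * eps)
print(analytic, numeric)  # the two numbers should be very close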
Gradient descent
w = w − learning rate × gradient\
The learning rate controls how big each downhill step is.\
Repeating this update pushes w toward values that fit the data. A toy illustration follows, then the batch helpers.
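A toy example of the same update rule, unrelated to the network itself (purely illustrative): minimizing f(w) = (w − 3)² drives w to 3 after a few dozen steps.

w_toy, lr_toy = 0.0, 0.1
for _ in range(100):
    grad = 2 * (w_toy - 3)  # f'(w)
    w_toy -= lr_toy * grad  # w = w − learning rate × gradient
print(w_toy)  # very close to 3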
def grad_add(g1, g2):  # add two gradients element-wise
    for layer in range(1, len(g1)):
        for pn in g1[layer].keys():
            g1[layer][pn] += g2[layer][pn]
    return g1
def grad_div(grad, denominator):  # divide a gradient by a scalar
    for layer in range(1, len(grad)):
        for pn in grad[layer].keys():
            grad[layer][pn] /= denominator
    return grad
def train_batch(current_batch, parameters):  # average gradient over one mini-batch
    grad_accu = grad_parameters(train_img[current_batch * batchSize + 0], train_lab[current_batch * batchSize + 0],
                                parameters)
    for img_i in range(1, batchSize):
        grad_tmp = grad_parameters(train_img[current_batch * batchSize + img_i],
                                   train_lab[current_batch * batchSize + img_i], parameters)
        grad_add(grad_accu, grad_tmp)  # sum the batchSize gradients
    grad_div(grad_accu, batchSize)  # take the average
    return grad_accu
def combine_parameters(parameters, grad, learn_rate):  # gradient-descent step: P -> P − learn_rate·grad C
    parameters_tmp = copy.deepcopy(parameters)
    for layer in range(1, len(parameters_tmp)):
        for pn in parameters_tmp[layer].keys():
            parameters_tmp[layer][pn] -= learn_rate * grad[layer][pn]
    return parameters_tmp
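A single step run by hand, just to see that the batch machinery fits together. A minimal sketch that does not modify the global result (the loss after the step is usually slightly smaller):

stepped = combine_parameters(result, train_batch(0, result), learnRate)
print(valid_loss(result), valid_loss(stepped))  # the second number is usually a little smaller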
Start the training (time to refine the elixir)
def get_result():
    train_accu_tmp = 0  # latest training accuracy
    valid_accu_tmp = 0  # latest validation accuracy
    global result
    print('\nInitial validation accuracy: {}'.format(valid_accuracy(result)))
    for epoch in range(epochNum):
        for i in tqdm(range(train_num // batchSize), desc='Training progress: {}/{}'.format(epoch + 1, epochNum)):
            grad_tmp = train_batch(i, result)  # gradient of this mini-batch
            result = combine_parameters(result, grad_tmp, learnRate)  # gradient-descent step
        # after every epoch, record the loss and accuracy
        with tqdm(total=4, desc='Recording stats: {}/{}'.format(epoch + 1, epochNum)) as pb:
            train_loss_list.append(train_loss(result))
            pb.update(1)
            valid_loss_list.append(valid_loss(result))
            pb.update(1)
            train_accu_tmp = train_accuracy(result)
            train_accu_list.append(train_accu_tmp)
            pb.update(1)
            valid_accu_tmp = valid_accuracy(result)
            valid_accu_list.append(valid_accu_tmp)
            pb.update(1)
    # report the final accuracies
    print('Training accuracy: {}'.format(train_accu_tmp))
    print('Validation accuracy: {}'.format(valid_accu_tmp))
    print('Test accuracy: {}'.format(test_accuracy(result)))
def show_plot():
    lower = 0
    plt.plot(train_loss_list[lower:], color='black', label='training loss')
    plt.plot(valid_loss_list[lower:], color='red', label='validation loss')
    plt.legend()
    plt.show()
    plt.plot(train_accu_list[lower:], color='black', label='training accuracy')
    plt.plot(valid_accu_list[lower:], color='red', label='validation accuracy')
    plt.legend()
    plt.show()
get_result()  # start the training
show_plot()  # show the charts
Initial validation accuracy: 0.0913
Training progress: 1/6: 100%|██████| 500/500 [00:21<00:00, 23.24it/s]
Recording stats: 1/6: 100%|██████| 4/4 [00:10<00:00, 2.73s/it]
Training progress: 2/6: 100%|██████| 500/500 [00:27<00:00, 18.30it/s]
Recording stats: 2/6: 100%|██████| 4/4 [00:11<00:00, 2.80s/it]
Training progress: 3/6: 100%|██████| 500/500 [00:24<00:00, 20.03it/s]
Recording stats: 3/6: 100%|██████| 4/4 [00:13<00:00, 3.31s/it]
Training progress: 4/6: 100%|██████| 500/500 [00:23<00:00, 21.02it/s]
Recording stats: 4/6: 100%|██████| 4/4 [00:13<00:00, 3.28s/it]
Training progress: 5/6: 100%|██████| 500/500 [00:35<00:00, 14.08it/s]
Recording stats: 5/6: 100%|██████| 4/4 [00:11<00:00, 2.99s/it]
Training progress: 6/6: 100%|██████| 500/500 [00:23<00:00, 20.97it/s]
Recording stats: 6/6: 100%|██████| 4/4 [00:11<00:00, 2.94s/it]
Training accuracy: 0.97618
Validation accuracy: 0.9684
Test accuracy: 0.9671
Process finished with exit code 0
Wrap-up
96% accuracy is decent; tweaking the learning rate or the number of epochs could probably push it a little higher. I read a lot of material before getting this to work, and there may still be mistakes in it. I don't fully understand everything yet, but I now have a rough grasp of the idea behind neural networks. They are fascinating, but also time-consuming, and long training runs get a bit tedious. Still, the more practice the better.
Also, when writing the program, pay attention to matrix shapes, otherwise you will get errors. Testing each function on its own is also important.
One more thing: the forum flags the post as containing prohibited content when more than six of the "█" progress-bar characters appear in a row......
These are just my notes, for reference only.
References
Video creators: 大野喵渣, 3Blue1Brown