一.前言
先说一下今天用到的数据集,虽然之前有文章讲过,但是这次可以讲的更透彻一些,温故而知新嘛:
Cora数据集之于图神经网络,就类似于MNIST数据集之于卷积神经网络。
Planetoid 是一组流行的节点分类基准数据集(Cora 就是其中之一),
常用于图神经网络(GNN)的研究。
Cora 数据集包含科学出版物及其引文关系。
再说一下今天的环境和包:
import torch #可以是cpu也可以是cuda
import torch_geometric
关于torch_geometric的安装可以问Tom,这里就不赘述了。
二.从头开始
从加载数据集开始:
from torch_geometric.datasets import Planetoid
# Load the Cora dataset through Planetoid, keeping its files under the given root.
# A raw string keeps the Windows path's backslash literal instead of relying on
# an unrecognised escape sequence, which newer Python versions warn about.
dataset = Planetoid(root=r"D:\data", name="Cora")
# Bare expression: echoes the dataset summary when run in a notebook cell.
dataset
输出:
数据集属性展示:
有些朋友可能会问,这个第一个数据是啥意思呢?好的,请听我娓娓道来:
所以dataset[0]其实就是数据集中的第一张图(Cora 只包含这一张图),里面分别包含了x(节点特征矩阵)、edge_index(这张图的边索引)、y(节点的类别标签),其余的train_mask、val_mask、test_mask则分别用在训练、验证和测试阶段。
那么我们接下来可以看看到底有多少节点用在了训练中:
# Count the non-zero (True) entries of the first graph's training mask,
# i.e. how many nodes are selected for training.
dataset[0].train_mask.count_nonzero()
这行代码 dataset[0].train_mask.count_nonzero() 统计的是 Cora 数据集训练集掩码中非零元素的数量。在图神经网络中,通常会将数据集的节点划分为训练集、验证集和测试集,而掩码就是用来标识每个节点属于哪个集合的布尔向量。因此,这行代码的作用是计算训练掩码中被标记为真的节点数量,这些节点将用于训练模型。
输出:
也就是说有140个节点用在了训练的阶段。
创建dataloader:
from torch_geometric.loader import DataLoader
# Number of graphs per mini-batch, shared by all three loaders.
BATCH_SIZE = 32

# The train/val/test loaders all wrap the same dataset; the actual split is
# applied later through the node masks inside the train and eval steps.
train_loader, val_loader, test_loader = (
    DataLoader(dataset, batch_size=BATCH_SIZE) for _ in range(3)
)
紧接着,创建我们的model:
这个神经网络模型 NodeClassifier 的特点和优点如下:
Cora数据集是一个常用的文献引用网络数据集,包含多个类别的科研论文节点和它们之间的引用关系。使用 NodeClassifier 模型处理Cora数据集的优点包括:
NodeClassifier 模型通过多层的图卷积操作能够有效地处理这种复杂性,提高了分类任务的准确性和效率。总之,这个模型在处理Cora数据集时利用了图神经网络的优势,能够有效地解决文献分类和节点属性预测等任务。
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
class NodeClassifier(nn.Module):
    """Node classifier: a stack of GCNConv layers followed by a linear head.

    Args:
        input_dim: Size of each input node feature vector.
        num_graph_layers: Number of GCNConv layers applied in ``forward``.
        hidden_dim: Output width of every GCNConv layer and hidden width of
            the classification head.
        output_dim: Number of classes scored for each node.
        dropout_pct: Dropout probability used after every graph layer and
            inside the classification head.
    """

    def __init__(self, input_dim, num_graph_layers, hidden_dim, output_dim, dropout_pct):
        super().__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.num_graph_layers = num_graph_layers
        self.dropout_pct = dropout_pct

        # The first layer maps input features to hidden_dim; the remaining
        # layers are hidden_dim -> hidden_dim.
        self.convs = nn.ModuleList()
        self.convs.append(GCNConv(input_dim, hidden_dim))
        for _ in range(num_graph_layers - 1):
            self.convs.append(GCNConv(hidden_dim, hidden_dim))

        self.clf_head = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.Dropout(dropout_pct),
            nn.Linear(hidden_dim, output_dim),
        )

    def forward(self, data):
        """Return per-node log-probabilities for ``data``.

        Args:
            data: Object exposing ``x`` (node features) and ``edge_index``.

        Returns:
            Tensor of log-softmax scores taken over dimension 1.
        """
        x, edge_index = data.x, data.edge_index
        for i in range(self.num_graph_layers):
            x = self.convs[i](x, edge_index)
            x = F.relu(x)
            # F.dropout defaults to training=True, so the module's mode must be
            # passed explicitly or dropout stays active after model.eval().
            x = F.dropout(x, p=self.dropout_pct, training=self.training)
        x = self.clf_head(x)
        return F.log_softmax(x, dim=1)

    def loss(self, pred, label):
        """Return the negative log-likelihood loss of ``pred`` against ``label``."""
        return F.nll_loss(pred, label)
关于这段代码,注释已经写好了markdown,所以大家可以直接看下图的解释。
然后可以查看一下模型:
# Smoke test: push the first batch through an untrained model and print the
# shape of its output.
model_test = NodeClassifier(dataset.num_features, 3, 256, dataset.num_classes, 0.5)
# Fetch exactly one batch directly instead of looping and breaking, so `pred`
# is never read from a loop that might not have run.
batch = next(iter(train_loader))
print("batch:", batch)
pred = model_test(batch)
print("pred:", pred.size())
输出:
那接下来就是我们的training啦:
def train_step(model, optimizer, train_loader, device):
    """Run one optimisation pass over ``train_loader``.

    Only the rows selected by each batch's ``train_mask`` contribute to the
    loss and to the accuracy.

    Args:
        model: Module that is callable on a batch and exposes ``loss(pred, label)``.
        optimizer: Optimizer updating the model's parameters.
        train_loader: Iterable of batches with ``to``, ``y`` and ``train_mask``.
        device: Device each batch is moved to.

    Returns:
        Tuple ``(mean_loss, accuracy)`` over all training rows seen. Both are
        NaN when no training rows were seen at all.
    """
    model.train()
    total_rows, total_loss, total_correct = 0, 0.0, 0
    for batch in train_loader:
        batch = batch.to(device)
        mask = batch.train_mask
        num_rows = int(torch.sum(mask).item())
        if num_rows == 0:
            # Nothing to learn from here; a loss over zero rows would be
            # undefined and must not reach the optimizer.
            continue
        optimizer.zero_grad()
        pred = model(batch)[mask]
        label = batch.y[mask]
        loss = model.loss(pred, label)
        loss.backward()
        optimizer.step()
        # model.loss is assumed to return a per-batch mean (the default
        # reduction of F.nll_loss), so weight it by the row count before
        # averaging over all rows.
        total_loss += loss.item() * num_rows
        total_correct += pred.argmax(dim=1).eq(label).sum().item()
        total_rows += num_rows
    if total_rows == 0:
        return float("nan"), float("nan")
    return total_loss / total_rows, total_correct / total_rows
def eval_step(model, eval_loader, device, is_validation=False):
    """Evaluate ``model`` on the validation or test rows of ``eval_loader``.

    Args:
        model: Module that is callable on a batch and exposes ``loss(pred, label)``.
        eval_loader: Iterable of batches with ``to``, ``y``, ``val_mask`` and
            ``test_mask``.
        device: Device each batch is moved to.
        is_validation: Use ``val_mask`` when True, ``test_mask`` otherwise.

    Returns:
        Tuple ``(mean_loss, accuracy)`` over all selected rows. Both are NaN
        when the masks select no rows at all.
    """
    model.eval()
    total_rows, total_loss, total_correct = 0, 0.0, 0
    for batch in eval_loader:
        batch = batch.to(device)
        mask = batch.val_mask if is_validation else batch.test_mask
        num_rows = int(torch.sum(mask).item())
        if num_rows == 0:
            # A loss over zero rows is undefined; skip the batch entirely.
            continue
        with torch.no_grad():
            pred = model(batch)[mask]
            label = batch.y[mask]
            loss = model.loss(pred, label)
        # model.loss is assumed to return a per-batch mean (the default
        # reduction of F.nll_loss), so weight it by the row count before
        # averaging over all rows.
        total_loss += loss.item() * num_rows
        total_correct += pred.argmax(dim=1).eq(label).sum().item()
        total_rows += num_rows
    if total_rows == 0:
        return float("nan"), float("nan")
    return total_loss / total_rows, total_correct / total_rows
def train_loop(model, optimizer, train_loader, val_loader, device,
               num_epochs, log_every=50):
    """Train for ``num_epochs`` epochs, validating after every epoch.

    Args:
        model: Model handed to ``train_step`` and ``eval_step``.
        optimizer: Optimizer handed to ``train_step``.
        train_loader: Loader used for the training pass.
        val_loader: Loader used for the validation pass.
        device: Device forwarded to both steps.
        num_epochs: Number of epochs to run.
        log_every: A progress line is printed for the first epoch and for
            every ``log_every``-th epoch.

    Returns:
        List with one ``(train_loss, train_acc, val_loss, val_acc)`` tuple
        per epoch.
    """
    log_template = "EPOCH {:3d}, TRAIN loss: {:.5f}, acc: {:.5f}, VAL loss: {:.5f}, acc: {:.5f}"
    history = []
    # Count epochs from 1 so the logged number needs no further adjustment.
    for epoch_number in range(1, num_epochs + 1):
        train_loss, train_acc = train_step(model, optimizer, train_loader, device)
        val_loss, val_acc = eval_step(model, val_loader, device, is_validation=True)
        epoch_record = (train_loss, train_acc, val_loss, val_acc)
        history.append(epoch_record)
        if epoch_number == 1 or epoch_number % log_every == 0:
            print(log_template.format(epoch_number, *epoch_record))
    return history
指定一些超参:
# model parameters
INPUT_DIM = dataset.num_features  # input width comes from the dataset's node features
HIDDEN_DIM = 64
OUTPUT_DIM = dataset.num_classes  # one output score per class in the dataset
NUM_GCN_LAYERS = 3  # passed as num_graph_layers when the models are built
DROPOUT_PCT = 0.5
# optimizer
LEARNING_RATE = 5e-4
WEIGHT_DECAY = 5e-3
NUM_EPOCHS = 500
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
建立model
# Build the GCN classifier from the hyperparameter constants and place it on
# the selected device in one step.
model = NodeClassifier(
    INPUT_DIM, NUM_GCN_LAYERS, HIDDEN_DIM, OUTPUT_DIM, DROPOUT_PCT
).to(device)
创建优化器
import torch.optim as optim
# AdamW over all model parameters, using the configured learning rate and
# weight decay.
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
# Bare expression: echoes the optimizer configuration when run in a notebook cell.
optimizer
输出:
开始train:
# Train for NUM_EPOCHS epochs; `history` collects the per-epoch metric tuples
# returned by train_loop.
history = train_loop(model, optimizer, train_loader, val_loader, device, NUM_EPOCHS)
画图:
import matplotlib.pyplot as plt
import numpy as np
def display_training_plots(history):
    """Plot train/validation loss and accuracy curves from ``history``.

    Args:
        history: Iterable of ``(train_loss, train_acc, val_loss, val_acc)``
            tuples, one per recorded step.

    The figure has two stacked panels: loss on top, accuracy below.
    """
    # Materialise the records once; unpacking enforces the 4-field shape.
    records = [(t_loss, t_acc, v_loss, v_acc) for t_loss, t_acc, v_loss, v_acc in history]
    train_losses = [record[0] for record in records]
    train_accs = [record[1] for record in records]
    val_losses = [record[2] for record in records]
    val_accs = [record[3] for record in records]
    xs = np.arange(len(train_losses))

    def draw_panel(position, train_series, val_series, y_label):
        # One panel of the 2x1 grid with a train and a validation curve.
        plt.subplot(2, 1, position)
        plt.plot(xs, train_series, label="train")
        plt.plot(xs, val_series, label="validation")
        plt.xlabel("iterations")
        plt.ylabel(y_label)
        plt.legend(loc="best")

    plt.figure(figsize=(10, 5))
    draw_panel(1, train_losses, val_losses, "loss")
    draw_panel(2, train_accs, val_accs, "accuracy")
    plt.show()
# Draw the loss and accuracy curves recorded during training.
display_training_plots(history)
最后的test
# Evaluate on the test mask (is_validation defaults to False); the returned
# loss is discarded and only the accuracy is reported.
_, test_acc = eval_step(model, test_loader, device)
print("Accuracy on test set: {:.5f}".format(test_acc))
输出:
除了使用GCNConv模块,我们也给出了另外两个模型的版本:
GAT version:
from torch_geometric.nn import GATConv
class NodeClassifierGAT(nn.Module):
    """Node classifier: a stack of GATConv layers followed by a linear head.

    Args:
        input_dim: Size of each input node feature vector.
        num_graph_layers: Number of GATConv layers applied in ``forward``.
        hidden_dim: Output width of every GATConv layer and hidden width of
            the classification head.
        output_dim: Number of classes scored for each node.
        dropout_pct: Dropout probability used after every graph layer and
            inside the classification head.
    """

    def __init__(self, input_dim, num_graph_layers, hidden_dim, output_dim, dropout_pct):
        super().__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.num_graph_layers = num_graph_layers
        self.dropout_pct = dropout_pct

        # The first layer maps input features to hidden_dim; the remaining
        # layers are hidden_dim -> hidden_dim.
        self.convs = nn.ModuleList()
        self.convs.append(GATConv(input_dim, hidden_dim))
        for _ in range(num_graph_layers - 1):
            self.convs.append(GATConv(hidden_dim, hidden_dim))

        self.clf_head = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.Dropout(dropout_pct),
            nn.Linear(hidden_dim, output_dim),
        )

    def forward(self, data):
        """Return per-node log-probabilities for ``data``.

        Args:
            data: Object exposing ``x`` (node features) and ``edge_index``.

        Returns:
            Tensor of log-softmax scores taken over dimension 1.
        """
        x, edge_index = data.x, data.edge_index
        for i in range(self.num_graph_layers):
            x = self.convs[i](x, edge_index)
            x = F.relu(x)
            # F.dropout defaults to training=True, so the module's mode must be
            # passed explicitly or dropout stays active after model.eval().
            x = F.dropout(x, p=self.dropout_pct, training=self.training)
        x = self.clf_head(x)
        return F.log_softmax(x, dim=1)

    def loss(self, pred, label):
        """Return the negative log-likelihood loss of ``pred`` against ``label``."""
        return F.nll_loss(pred, label)
# Build the GAT variant with the same hyperparameter constants as the GCN model
# (NUM_GCN_LAYERS sets the number of GAT layers here) and move it to the device.
model_gat = NodeClassifierGAT(
    INPUT_DIM, NUM_GCN_LAYERS, HIDDEN_DIM, OUTPUT_DIM, DROPOUT_PCT
).to(device)
test:
GraphSAGE version
from torch_geometric.nn import SAGEConv
class NodeClassifierSAGE(nn.Module):
    """Node classifier: a stack of SAGEConv layers followed by a linear head.

    Args:
        input_dim: Size of each input node feature vector.
        num_graph_layers: Number of SAGEConv layers applied in ``forward``.
        hidden_dim: Output width of every SAGEConv layer and hidden width of
            the classification head.
        output_dim: Number of classes scored for each node.
        dropout_pct: Dropout probability used after every graph layer and
            inside the classification head.
    """

    def __init__(self, input_dim, num_graph_layers, hidden_dim, output_dim, dropout_pct):
        super().__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.num_graph_layers = num_graph_layers
        self.dropout_pct = dropout_pct

        # The first layer maps input features to hidden_dim; the remaining
        # layers are hidden_dim -> hidden_dim.
        self.convs = nn.ModuleList()
        self.convs.append(SAGEConv(input_dim, hidden_dim))
        for _ in range(num_graph_layers - 1):
            self.convs.append(SAGEConv(hidden_dim, hidden_dim))

        self.clf_head = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.Dropout(dropout_pct),
            nn.Linear(hidden_dim, output_dim),
        )

    def forward(self, data):
        """Return per-node log-probabilities for ``data``.

        Args:
            data: Object exposing ``x`` (node features) and ``edge_index``.

        Returns:
            Tensor of log-softmax scores taken over dimension 1.
        """
        x, edge_index = data.x, data.edge_index
        for i in range(self.num_graph_layers):
            x = self.convs[i](x, edge_index)
            x = F.relu(x)
            # F.dropout defaults to training=True, so the module's mode must be
            # passed explicitly or dropout stays active after model.eval().
            x = F.dropout(x, p=self.dropout_pct, training=self.training)
        x = self.clf_head(x)
        return F.log_softmax(x, dim=1)

    def loss(self, pred, label):
        """Return the negative log-likelihood loss of ``pred`` against ``label``."""
        return F.nll_loss(pred, label)
# Build the GraphSAGE variant with the same hyperparameter constants as the GCN
# model (NUM_GCN_LAYERS sets the number of SAGE layers here) and move it to the
# device.
model_sage = NodeClassifierSAGE(
    INPUT_DIM, NUM_GCN_LAYERS, HIDDEN_DIM, OUTPUT_DIM, DROPOUT_PCT
).to(device)
test: