import os.path as osp
import numpy as np
import tensorlayerx as tlx
from gammagl.utils import coalesce
from gammagl.data import InMemoryDataset, download_url, Graph
class WebKB(InMemoryDataset):
    r"""The WebKB datasets used in the
    `"Geom-GCN: Geometric Graph Convolutional Networks"
    <https://openreview.net/forum?id=S1e2agrFvS>`_ paper.
    Nodes represent web pages and edges represent hyperlinks between them.
    Node features are the bag-of-words representation of web pages.
    The task is to classify the nodes into one of the five categories, student,
    project, course, staff, and faculty.

    Parameters
    ----------
    root: str, optional
        Root directory where the dataset should be saved.
    name: str
        The name of the dataset. (:obj:`"Cornell"`,
        :obj:`"Texas"`, :obj:`"Wisconsin"`)
        The name is lower-cased before it is checked and stored.
        (default: :obj:`"cornell"`)
    transform: callable, optional
        A function/transform that takes in an
        :obj:`gammagl.data.Graph` object and returns a transformed
        version. The data object will be transformed before every access.
        (default: :obj:`None`)
    pre_transform: callable, optional
        A function/transform that takes in
        an :obj:`gammagl.data.Graph` object and returns a
        transformed version. The data object will be transformed before
        being saved to disk. (default: :obj:`None`)
    force_reload: bool, optional
        Whether to re-process the dataset.
        (default: :obj:`False`)

    Raises
    ------
    ValueError
        If :obj:`name` is not one of the supported dataset names.
    """

    url = 'https://raw.githubusercontent.com/graphdml-uiuc-jlu/geom-gcn/master'

    def __init__(self, root=None, name='cornell', transform=None,
                 pre_transform=None, force_reload: bool = False):
        self.name = name.lower()
        # Explicit check instead of ``assert``: assertions are stripped when
        # Python runs with -O, which would let an invalid name reach download().
        supported = ('cornell', 'texas', 'wisconsin')
        if self.name not in supported:
            raise ValueError(
                f"Unknown WebKB dataset name {name!r}; "
                f"expected one of {supported}."
            )
        super().__init__(root, transform, pre_transform,
                         force_reload=force_reload)
        self.data, self.slices = self.load_data(self.processed_paths[0])

    @property
    def raw_dir(self):
        """Directory holding the raw files: ``<root>/<name>/raw``."""
        return osp.join(self.root, self.name, 'raw')

    @property
    def processed_dir(self):
        """Directory holding the processed file: ``<root>/<name>/processed``."""
        return osp.join(self.root, self.name, 'processed')

    @property
    def raw_file_names(self):
        """Raw file names: node features/labels, edges, then ten split files.

        The order matters: ``download`` and ``process`` both treat the first
        two entries as the graph files and the remainder as split files.
        """
        out = ['out1_node_feature_label.txt', 'out1_graph_edges.txt']
        out += [f'{self.name}_split_0.6_0.2_{i}.npz' for i in range(10)]
        return out

    @property
    def processed_file_names(self):
        """Processed file name, prefixed with the active TensorLayerX backend."""
        return tlx.BACKEND + '_data.pt'

    def download(self):
        """Download the two graph files and the split files into ``raw_dir``."""
        names = self.raw_file_names
        # Graph files live under new_data/<name>/, split files under splits/.
        for fname in names[:2]:
            download_url(f'{self.url}/new_data/{self.name}/{fname}', self.raw_dir)
        for fname in names[2:]:
            download_url(f'{self.url}/splits/{fname}', self.raw_dir)

    @staticmethod
    def _read_rows(path):
        """Return the non-blank data lines of ``path`` without the header line."""
        with open(path, 'r') as fh:
            next(fh, None)  # skip the header line (no-op on an empty file)
            # Skipping blank lines keeps the last record even when the file
            # has no trailing newline, and tolerates extra trailing newlines.
            return [line.rstrip('\n') for line in fh if line.strip()]

    def process(self):
        """Parse the raw files into a single ``Graph`` and save it to disk.

        Reads tab-separated node rows (column 1: comma-separated features,
        column 2: integer label) and tab-separated edge rows, loads the
        train/val/test masks from every split file, applies ``pre_transform``
        if one was given, and saves the collated result to
        ``processed_paths[0]``.
        """
        features, labels = [], []
        for row in self._read_rows(self.raw_paths[0]):
            cols = row.split('\t')
            features.append([float(v) for v in cols[1].split(',')])
            labels.append(int(cols[2]))
        x = np.array(features, dtype=np.float32)
        y = np.array(labels, dtype=np.int64)

        edges = [[int(v) for v in row.split('\t')]
                 for row in self._read_rows(self.raw_paths[1])]
        # Transpose the list of edge rows so each column is one edge.
        edge_index = np.ascontiguousarray(np.array(edges, dtype=np.int64).T)
        edge_index = coalesce(edge_index)

        train_masks, val_masks, test_masks = [], [], []
        for split_path in self.raw_paths[2:]:
            # np.load on an .npz keeps the archive open; the context manager
            # closes it once the arrays have been copied out by astype().
            with np.load(split_path) as split:
                train_masks.append(split['train_mask'].astype(np.bool_))
                val_masks.append(split['val_mask'].astype(np.bool_))
                test_masks.append(split['test_mask'].astype(np.bool_))
        # NOTE(review): np.concatenate joins the per-split masks end to end
        # along axis 0 (it does not add a separate split axis) -- confirm
        # downstream code expects this layout.
        train_mask = np.concatenate(train_masks)
        val_mask = np.concatenate(val_masks)
        test_mask = np.concatenate(test_masks)

        data = Graph(x=x, edge_index=edge_index, y=y, train_mask=train_mask,
                     val_mask=val_mask, test_mask=test_mask)
        if self.pre_transform is not None:
            data = self.pre_transform(data)
        self.save_data(self.collate([data]), self.processed_paths[0])

    def __repr__(self) -> str:
        return f'{self.name}()'