# Source code for mindspore_gl.dataset.cora

# Copyright 2022 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""CoraV2"""
import os
import pickle as pkl
import numpy as np
import networkx as nx
import scipy.sparse as sp
from scipy.sparse import coo_matrix, csr_matrix
from mindspore_gl.graph.graph import MindHomoGraph, CsrAdj


class CoraV2:
    r"""
    Cora Dataset, a source dataset for reading and parsing Cora dataset.

    On construction the cached file ``cora_v2_with_mask.npz`` is loaded from
    `root` if it is present. Otherwise, if `root` exists, the raw Planetoid
    files are processed by :meth:`preprocess` (which writes the npz cache into
    `root`) and the cache is then loaded.

    Args:
        root(str): path to the root directory that contains cora_v2_with_mask.npz
            (or the raw ``ind.cora_v2.*`` files).

    Raises:
        TypeError: If `root` is not a str.
        RuntimeError: If `root` does not exist.
        OSError: If `root` exists but a raw data file needed by
            :meth:`preprocess` cannot be opened.

    Examples:
        >>> from mindspore_gl.dataset import CoraV2
        >>> root = "path/to/corav2"
        >>> dataset = CoraV2(root)

    About Cora dataset:

    The Cora dataset consists of 2708 scientific publications classified into one of seven classes.
    The citation network consists of 10556 links. Each publication in the dataset is described by
    a 0/1-valued word vector indicating the absence/presence of the corresponding word from the
    dictionary. The dictionary consists of 1433 unique words.

    Statistics:

    - Nodes: 2708
    - Edges: 10556
    - Number of Classes: 7
    - Label split:

      - Train: 140
      - Valid: 500
      - Test: 1000

    Dataset can be downloaded here: <https://github.com/kimiyoung/planetoid>

    You can organize the dataset files into the following directory structure and read by `process` API.

    .. code-block::

        .
        └── corav2
            ├── ind.cora_v2.allx
            ├── ind.cora_v2.ally
            ├── ind.cora_v2.graph
            ├── ind.cora_v2.test.index
            ├── ind.cora_v2.tx
            ├── ind.cora_v2.ty
            ├── ind.cora_v2.x
            └── ind.cora_v2.y
    """

    def __init__(self, root):
        if not isinstance(root, str):
            # Use the real class name: this class defines no `cls_name`
            # attribute, so referencing it would raise AttributeError and
            # mask the intended TypeError.
            raise TypeError(f"For '{type(self).__name__}', the 'root' should be a str, "
                            f"but got {type(root)}.")
        self._root = root
        self._path = os.path.join(root, 'cora_v2_with_mask.npz')

        # CSR topology, filled by load().
        self._csr_row = None
        self._csr_col = None
        self._nodes = None

        # Lazily populated caches for the arrays stored in the npz file.
        self._node_feat = None
        self._node_label = None
        self._train_mask = None
        self._val_mask = None
        self._test_mask = None
        self._npz_file = None

        if os.path.isfile(self._path):
            self.load()
        elif os.path.exists(self._root):
            self.preprocess()
            self.load()
        else:
            # RuntimeError is what the class documents; it is still an
            # Exception subclass, so existing `except Exception` callers work.
            raise RuntimeError('data file does not exist')

    def preprocess(self):
        """
        Process the raw Planetoid files found in `root` and save the result
        to ``cora_v2_with_mask.npz`` inside `root`.

        Raises:
            OSError: If one of the raw ``ind.cora_v2.*`` files cannot be opened.
        """
        # Note: the 'x' file is not read; only these six objects are used.
        names = ['y', 'tx', 'ty', 'allx', 'ally', 'graph']
        dataset_str = 'cora_v2'
        objects = []
        for name in names:
            with open("{}/ind.{}.{}".format(self._root, dataset_str, name), 'rb') as f:
                # NOTE(security): unpickling can execute arbitrary code; only
                # point `root` at dataset files obtained from a trusted source.
                objects.append(pkl.load(f, encoding='latin1'))
        y, tx, ty, allx, ally, graph = tuple(objects)

        test_idx_reorder = _parse_index_file("{}/ind.{}.test.index".format(self._root, dataset_str))
        test_idx_range = np.sort(test_idx_reorder)

        # Put the test rows back into index order, then row-normalize.
        features = sp.vstack((allx, tx)).tolil()
        features[test_idx_reorder, :] = features[test_idx_range, :]
        features = _normalize_cora_features(features)

        graph = nx.Graph(nx.from_dict_of_lists(graph))
        graph = graph.to_directed()

        onehot_labels = np.vstack((ally, ty))
        onehot_labels[test_idx_reorder, :] = onehot_labels[test_idx_range, :]
        labels = np.argmax(onehot_labels, 1)
        num_nodes = len(labels)

        adj_coo_row = []
        adj_coo_col = []
        for src, dst in graph.edges:
            adj_coo_row.append(src)
            adj_coo_col.append(dst)
        # Append one self-loop (i, i) per node after the graph edges.
        adj_coo_row.extend(range(num_nodes))
        adj_coo_col.extend(range(num_nodes))
        num_edges = len(adj_coo_row)

        idx_test = test_idx_range.tolist()
        idx_train = range(len(y))
        # The validation split is the 500 nodes that follow the training nodes.
        idx_val = range(len(y), len(y) + 500)
        train_mask = _sample_mask(idx_train, num_nodes)
        val_mask = _sample_mask(idx_val, num_nodes)
        test_mask = _sample_mask(idx_test, num_nodes)

        adj_coo_matrix = coo_matrix((np.ones(len(adj_coo_row), dtype=bool), (adj_coo_row, adj_coo_col)),
                                    shape=(num_nodes, num_nodes))
        out_degrees = np.sum(adj_coo_matrix, axis=1)
        in_degrees = np.sum(adj_coo_matrix, axis=0)
        adj_csr_matrix = adj_coo_matrix.tocsr()

        np.savez(self._path, feat=features, label=labels, test_mask=test_mask, train_mask=train_mask,
                 val_mask=val_mask, adj_coo_row=adj_coo_row, adj_coo_col=adj_coo_col,
                 adj_csr_indptr=adj_csr_matrix.indptr, adj_csr_indices=adj_csr_matrix.indices,
                 in_degrees=in_degrees, out_degrees=out_degrees, adj_csr_data=adj_csr_matrix.data,
                 n_edges=num_edges, n_nodes=num_nodes,
                 n_classes=onehot_labels.shape[1])

    def load(self):
        """Load the saved npz dataset from files."""
        # The npz handle is kept on the instance because the feature, label
        # and mask properties read their arrays from it on first access.
        self._npz_file = np.load(self._path)
        self._csr_row = self._npz_file['adj_csr_indptr'].astype(np.int32)
        self._csr_col = self._npz_file['adj_csr_indices'].astype(np.int32)
        # The CSR index pointer has one more entry than there are rows.
        self._nodes = np.arange(len(self._csr_row) - 1)

    @property
    def num_features(self):
        """
        Feature size of each node

        Returns:
            int, the number of feature size

        Examples:
            >>> #dataset is an instance object of Dataset
            >>> num_features = dataset.num_features
        """
        return self.node_feat.shape[1]

    @property
    def num_classes(self):
        """
        Number of label classes

        Returns:
            int, the number of distinct values in the node labels

        Examples:
            >>> #dataset is an instance object of Dataset
            >>> num_classes = dataset.num_classes
        """
        return len(np.unique(self.node_label))

    @property
    def train_mask(self):
        """
        Mask of training nodes

        Returns:
            numpy.ndarray, array of mask

        Examples:
            >>> #dataset is an instance object of Dataset
            >>> train_mask = dataset.train_mask
        """
        if self._train_mask is None:
            self._train_mask = self._npz_file['train_mask']
        return self._train_mask

    @property
    def test_mask(self):
        """
        Mask of test nodes

        Returns:
            numpy.ndarray, array of mask

        Examples:
            >>> #dataset is an instance object of Dataset
            >>> test_mask = dataset.test_mask
        """
        if self._test_mask is None:
            self._test_mask = self._npz_file['test_mask']
        return self._test_mask

    @property
    def val_mask(self):
        """
        Mask of validation nodes

        Returns:
            numpy.ndarray, array of mask

        Examples:
            >>> #dataset is an instance object of Dataset
            >>> val_mask = dataset.val_mask
        """
        if self._val_mask is None:
            self._val_mask = self._npz_file['val_mask']
        return self._val_mask

    @property
    def train_nodes(self):
        """
        training nodes indexes

        Returns:
            numpy.ndarray, int32 array of the indexes where the train mask is set

        Examples:
            >>> #dataset is an instance object of Dataset
            >>> train_nodes = dataset.train_nodes
        """
        return (np.nonzero(self.train_mask)[0]).astype(np.int32)

    @property
    def node_count(self):
        """
        Number of nodes

        Returns:
            int, number of rows of the CSR adjacency (length of csr row minus one)

        Examples:
            >>> #dataset is an instance object of Dataset
            >>> node_count = dataset.node_count
        """
        # The CSR index pointer holds n + 1 offsets for n nodes, so its raw
        # length would over-count by one.
        return len(self._csr_row) - 1

    @property
    def edge_count(self):
        """
        Number of edges

        Returns:
            int, length of csr col

        Examples:
            >>> #dataset is an instance object of Dataset
            >>> edge_count = dataset.edge_count
        """
        return len(self._csr_col)

    @property
    def node_feat(self):
        """
        Node features

        Returns:
            numpy.ndarray, array of node feature

        Examples:
            >>> #dataset is an instance object of Dataset
            >>> node_feat = dataset.node_feat
        """
        if self._node_feat is None:
            self._node_feat = self._npz_file["feat"]
        return self._node_feat

    @property
    def node_label(self):
        """
        Ground truth labels of each node

        Returns:
            numpy.ndarray, int32 array of node label

        Examples:
            >>> #dataset is an instance object of Dataset
            >>> node_label = dataset.node_label
        """
        if self._node_label is None:
            self._node_label = self._npz_file["label"]
        return self._node_label.astype(np.int32)

    @property
    def adj_coo(self):
        """
        Return the adjacency matrix of COO representation

        Returns:
            scipy.sparse.coo_matrix, adjacency matrix with every stored entry set to 1.

        Examples:
            >>> #dataset is an instance object of Dataset
            >>> adj_coo = dataset.adj_coo
        """
        return self.adj_csr.tocoo(copy=False)

    @property
    def adj_csr(self):
        """
        Return the adjacency matrix of CSR representation.

        Returns:
            scipy.sparse.csr_matrix, adjacency matrix with every stored entry set to 1.

        Examples:
            >>> #dataset is an instance object of Dataset
            >>> adj_csr = dataset.adj_csr
        """
        return csr_matrix((np.ones(self._csr_col.shape), self._csr_col, self._csr_row))

    def __getitem__(self, idx):
        """Return the single graph of the dataset; `idx` must be 0."""
        assert idx == 0, "Cora only has one graph"
        graph = MindHomoGraph()
        # Identity mapping from node id to node index.
        node_dict = {node: node for node in range(self.node_count)}
        edge_ids = np.arange(self.edge_count, dtype=np.int32)
        graph.set_topo(CsrAdj(self._csr_row, self._csr_col), node_dict=node_dict, edge_ids=edge_ids)
        return graph
def _parse_index_file(filename):
    """
    Parse index file.

    Args:
        filename (str): path of a text file holding one integer per line.

    Returns:
        list[int], the integers in file order. Blank lines are skipped.
    """
    # The context manager closes the handle, which a bare `open()` in a
    # for-loop would leave to the garbage collector.
    with open(filename) as f:
        return [int(line.strip()) for line in f if line.strip()]


def _normalize_cora_features(features):
    """
    Scale every row of a sparse feature matrix so that it sums to one.

    Rows whose sum is zero are left as all zeros.

    Args:
        features: scipy sparse matrix of node features.

    Returns:
        numpy.ndarray, dense row-normalized feature matrix.
    """
    row_sum = np.array(features.sum(1))
    # 1 / 0 yields inf, which is replaced by 0 right below; silence only that
    # expected divide-by-zero warning.
    with np.errstate(divide='ignore'):
        r_inv = np.power(row_sum * 1.0, -1).flatten()
    r_inv[np.isinf(r_inv)] = 0.
    r_mat_inv = sp.diags(r_inv)
    features = r_mat_inv.dot(features)
    return np.asarray(features.todense())


def _sample_mask(idx, l):
    """
    Create mask.

    Args:
        idx: indexes (anything usable as a numpy index) to set to True.
        l (int): length of the mask.

    Returns:
        numpy.ndarray, boolean array of length `l` that is True at `idx`.
    """
    mask = np.zeros(l, dtype=bool)
    mask[idx] = True
    return mask