# Source code for gammagl.datasets.polblogs

import numpy as np
import scipy.sparse as sp
import tensorlayerx as tlx
import os
import pandas as pd
from typing import Callable, List, Optional
from gammagl.data import download_url, InMemoryDataset, Graph, extract_tar



# [docs]
class PolBlogs(InMemoryDataset):
    r"""The Political Blogs dataset from the `"The Political Blogosphere and
    the 2004 US Election: Divided they Blog"
    <https://dl.acm.org/doi/10.1145/1134271.1134277>`_ paper.

    :class:`PolBlogs` is a graph with 1,490 vertices (representing political
    blogs) and 19,025 edges (links between blogs).
    The links are automatically extracted from a crawl of the front page of the
    blog.
    Each vertex receives a label indicating the political leaning of the blog:
    liberal or conservative.

    Parameters
    ----------
    root: str, optional
        Root directory where the dataset should be saved.
    transform: callable, optional
        A function/transform that takes in a
        :obj:`gammagl.data.Graph` object and returns a transformed
        version. The data object will be transformed before every access.
        (default: :obj:`None`)
    pre_transform: callable, optional
        A function/transform that takes in
        a :obj:`gammagl.data.Graph` object and returns a
        transformed version. The data object will be transformed before
        being saved to disk. (default: :obj:`None`)
    force_reload: bool, optional
        Whether to re-process the dataset.
        (default: :obj:`False`)


    **STATS:**

    .. list-table::
        :widths: 10 10 10 10
        :header-rows: 1

        * - #nodes
          - #edges
          - #features
          - #classes
        * - 1,490
          - 19,025
          - 0
          - 2

    The raw files carry no node attributes; :meth:`process` assigns every
    node a one-hot identity row as its feature vector ``x``.
    """

    url = 'https://netset.telecom-paris.fr/datasets/polblogs.tar.gz'

    def __init__(self, root: Optional[str] = None, transform: Optional[Callable] = None,
                 pre_transform: Optional[Callable] = None, force_reload: bool = False):
        super().__init__(root, transform, pre_transform, force_reload=force_reload)
        # Load the collated graph written by ``process``.
        self.data, self.slices = self.load_data(self.processed_paths[0])

    @property
    def raw_file_names(self) -> List[str]:
        """Raw files expected in ``raw_dir``: the edge list, then the labels."""
        return ['adjacency.tsv', 'labels.tsv']

    @property
    def processed_file_names(self) -> str:
        """Name of the processed file, prefixed with the active TensorLayerX backend."""
        return tlx.BACKEND + '_data.pt'

    def download(self):
        """Fetch the archive from :attr:`url` into ``raw_dir``, unpack it there
        and delete the archive afterwards."""
        path = download_url(self.url, self.raw_dir)
        extract_tar(path, self.raw_dir)
        os.unlink(path)

    def process(self):
        """Build the graph from the raw TSV files and save it to disk.

        The first two columns of ``adjacency.tsv`` become ``edge_index``
        (shape ``(2, num_edges)``, ``int64``), ``labels.tsv`` becomes the
        flattened label tensor ``y``, and ``x`` is an identity matrix with
        one row per label.
        """
        edges = pd.read_csv(self.raw_paths[0], header=None, sep='\t',
                            usecols=[0, 1])
        edge_index = np.ascontiguousarray(np.array(edges, dtype=np.int64).T)

        labels = pd.read_csv(self.raw_paths[1], header=None, sep='\t')
        # One label row per node, so the label file defines the node count
        # rather than a hard-coded 1490.
        num_nodes = labels.shape[0]
        y = tlx.reshape(tlx.convert_to_tensor(labels.values), (-1,))

        # One-hot identity features, built directly as a plain ndarray
        # (no dense -> sparse -> np.matrix round trip).
        x = np.eye(num_nodes)

        data = Graph(x=x, edge_index=edge_index, y=y)

        if self.pre_transform is not None:
            data = self.pre_transform(data)
        self.save_data(self.collate([data]), self.processed_paths[0])