Source code for gammagl.utils.homophily

import tensorlayerx as tlx
import numpy as np
from gammagl.mpops import unsorted_segment_mean
from gammagl.utils import degree



[docs]
def homophily(edge_index, y, batch=None, method: str = 'edge'):
    r"""The homophily of a graph characterizes how likely nodes with the same 
    label are near each other in a graph.
    There are many measures of homophily that fits this definition.
    In particular:

    - In the `"Beyond Homophily in Graph Neural Networks: Current Limitations
      and Effective Designs" <https://arxiv.org/abs/2006.11468>`_ paper, the
      homophily is the fraction of edges in a graph which connects nodes
      that have the same class label:

      .. math::
         \frac{| \{ (v,w) : (v,w) \in \mathcal{E} \wedge y_v = y_w \} | }
         {| \mathcal{E}|}

      That measure is called the *edge homophily ratio*.

    - In the `"Geom-GCN: Geometric Graph Convolutional Networks"
      <https://arxiv.org/abs/2002.05287>`_ paper, edge homophily is normalized
      across neighborhoods:

      .. math::
         \frac{1}{| \mathcal{V}|} \sum_{v \in \mathcal{V}} \frac{ | \{ (w,v) : w
         \in \mathcal{N}(v) \wedge y_v = y_w \} |  } { | \mathcal{N}(v)| }

      That measure is called the *node homophily ratio*.

    - In the `"Large-Scale Learning on Non-Homophilous Graphs: New Benchmarks
      and Strong Simple Methods" <https://arxiv.org/abs/2110.14446>`_ paper,
      edge homophily is modified to be insensitive to the number of classes
      and size of each class:

      .. math::
         \frac{1}{C-1} \sum_{k=1}^{C} \max \left(0, h_k - \frac{| \mathcal{C}_k|}
         {| \mathcal{V}|} \right)

      where :math:`C` denotes the number of classes, :math:`| \mathcal{C}_k|`
      denotes the number of nodes of class :math:`k`, and :math:`h_k` denotes
      the edge homophily ratio of nodes of class :math:`k`.
      Thus, that measure is called the *class insensitive edge homophily
      ratio*.

    Parameters
    ----------
    edge_index: tensor
        The graph connectivity.
    y: tensor
        The labels.
    batch: tensor, optional
        Batch vector\
        :math:`\mathbf{b} \in {\{ 0, \ldots,B-1\}}^N`, which assigns
        each node to a specific example. (default: :obj:`None`)
    method: str, optional
        The method used to calculate the homophily,
        either :obj:`"edge"` (first formula), :obj:`"node"` (second
        formula) or :obj:`"edge_insensitive"` (third formula).
        (default: :obj:`"edge"`)

    Examples:
        >>> edge_index = tlx.convert_to_tensor([[0, 1, 2, 3],
        ...                            [1, 2, 0, 4]])
        >>> y = tlx.convert_to_tensor([0, 0, 0, 0, 1])
        >>> # Edge homophily ratio
        >>> homophily(edge_index, y, method='edge')
        0.75
        >>> # Node homophily ratio
        >>> homophily(edge_index, y, method='node')
        0.6000000238418579
        >>> # Class insensitive edge homophily ratio
        >>> homophily(edge_index, y, method='edge_insensitive')
        0.19999998807907104

    """
    assert method in {'edge', 'node', 'edge_insensitive'}
    if tlx.is_tensor(edge_index):
        edge_index = tlx.convert_to_numpy(edge_index)
    if tlx.is_tensor(y):
        y = tlx.convert_to_numpy(y)
    y = np.squeeze(y, axis=-1) if y.ndim > 1 else y

    row, col = edge_index[0], edge_index[1]

    if method == 'edge':
        out = np.zeros(row.shape[0])
        out[y[row] == y[col]] = 1.
        if batch is None:
            return float(out.mean())
        else:
            out = tlx.convert_to_tensor(out, dtype=tlx.float32)
            col = tlx.convert_to_tensor(col)
            batch = tlx.convert_to_tensor(batch)
            return unsorted_segment_mean(out, tlx.gather(batch, col))

    elif method == 'node':
        out = np.zeros(row.shape[0])
        out[y[row] == y[col]] = 1.

        out = unsorted_segment_mean(tlx.convert_to_tensor(out, dtype=tlx.float32), tlx.convert_to_tensor(col))
        print(out)
        if batch is None:
            return float(tlx.reduce_mean(out))
        else:
            return unsorted_segment_mean(out, batch)

    elif method == 'edge_insensitive':
        assert y.ndim == 1
        num_classes = int(y.max()) + 1
        assert num_classes >= 2
        if batch is None:
            batch = np.zeros_like(y)
        elif tlx.is_tensor(batch):
            batch = tlx.convert_to_numpy(batch)
        num_nodes = tlx.convert_to_numpy(degree(batch))
        num_graphs = num_nodes.size
        batch = num_classes * batch + y

        h = tlx.convert_to_numpy(homophily(edge_index, y, batch, method='edge'))
        h = np.reshape(h, [num_graphs, num_classes])

        counts = np.bincount(batch, minlength=num_classes * num_graphs)
        counts = np.reshape(counts, [num_graphs, num_classes])
        proportions = counts / np.reshape(num_nodes, [-1, 1])

        out = np.clip((h - proportions), a_min=0, a_max=None).sum(axis=-1)
        out /= num_classes - 1
        return out if out.size > 1 else float(out)

    else:
        raise NotImplementedError