Source code for gammagl.models.gen

import numpy as np
import tensorlayerx as tlx
from collections import Counter
from scipy.sparse import coo_matrix
from gammagl.utils import homophily, mask_to_index



[docs]
class GEstimationN():
    r"""Provide adjacency matrix estimation implementation based on the Expectation-Maximization(EM) algorithm.

        Parameters
        ----------
        E: numpy.array
        The actual observed number of edges between every pair of nodes.

    """

    def __init__(self, data):
        self.num_class = data.num_classes
        graph = data[0]
        self.num_node = graph.num_nodes
        self.idx_train = tlx.convert_to_numpy(mask_to_index(graph.train_mask))
        self.label = tlx.convert_to_numpy(graph.y)
        row = graph.edge_index[0]
        col = graph.edge_index[1]
        data = np.ones(len(row))
        if tlx.BACKEND == 'torch':
            row = tlx.convert_to_numpy(row)
            col = tlx.convert_to_numpy(col)
        self.adj = coo_matrix((data, (row, col)), shape=(self.num_node, self.num_node)).toarray()

        self.output = None
        self.iterations = 0

        self.homophily = homophily(graph.edge_index, graph.y, method='node')


[docs]
    def reset_obs(self):
        self.N = 0
        self.E = np.zeros((self.num_node, self.num_node), dtype=np.int64)



[docs]
    def update_obs(self, output):
        self.E += output
        self.N += 1



[docs]
    def revise_pred(self):
        # revise the prediction with train label
        # self.output = tlx.scatter_update(self.output, self.idx_train, tlx.gather(self.label, self.idx_train))
        self.output[self.idx_train] = self.label[self.idx_train]



[docs]
    def E_step(self, Q):
        r"""
            Run the Expectation(E) step of the EM algorithm.

            Parameters
            ----------
            Q: numpy.array
                The current estimation that each edge is actually present

            Returns
            ----------
            alpha: float
                The estimation of true-positive rate
            beta: float
                The estimation of false-positive rate
            O: numpy.array
                The estimation of network model parameters
        """
        # Temporary variables to hold the numerators and denominators of alpha and beta
        an = Q * self.E
        an = np.triu(an, 1).sum()
        bn = (1 - Q) * self.E
        bn = np.triu(bn, 1).sum()
        ad = Q * self.N
        ad = np.triu(ad, 1).sum()
        bd = (1 - Q) * self.N
        bd = np.triu(bd, 1).sum()

        # Calculate alpha, beta
        alpha = an * 1. / (ad)
        beta = bn * 1. / (bd)

        O = np.zeros((self.num_class, self.num_class))

        n = []
        counter = Counter(self.output)
        for i in range(self.num_class):
            n.append(counter[i])

        a = self.output.repeat(self.num_node).reshape(self.num_node, -1)
        for j in range(self.num_class):
            c = (a == j)
            for i in range(j + 1):
                b = (a == i)
                O[i, j] = np.triu((b & c.T) * Q, 1).sum()
                if i == j:
                    O[j, j] = 2. / (n[j] * (n[j] - 1)) * O[j, j]
                else:
                    O[i, j] = 1. / (n[i] * n[j]) * O[i, j]
        return (alpha, beta, O)



[docs]
    def M_step(self, alpha, beta, O):
        """Run the Maximization(M) step of the EM algorithm.
        """
        O += O.T - np.diag(O.diagonal())

        row = self.output.repeat(self.num_node)
        col = np.tile(self.output, self.num_node)
        tmp = O[row, col].reshape(self.num_node, -1)

        p1 = tmp * np.power(alpha, self.E) * np.power(1 - alpha, self.N - self.E)
        p2 = (1 - tmp) * np.power(beta, self.E) * np.power(1 - beta, self.N - self.E)
        Q = p1 * 1. / (p1 + p2 * 1.)
        return Q




[docs]
    def EM(self, output, tolerance=.000001):
        r"""
            Run the complete EM algorithm.

            Parameters
            ----------
            tolerance: float
                Determine the tolerance in the variantions of alpha, beta and O, which is acceptable to stop iterating
            seed: int
                seed for np.random.seed

            Returns
            ----------
            iterations: int
                The number of iterations to achieve the tolerance on the parameters
        """
        # Record previous values to confirm convergence
        alpha_p = 0
        beta_p = 0

        self.output = tlx.convert_to_numpy(output)
        self.revise_pred()

        # Do an initial E-step with random alpha, beta and O
        # Beta must be smaller than alpha
        beta, alpha = np.sort(np.random.rand(2))
        O = np.triu(np.random.rand(self.num_class, self.num_class))

        # Calculate initial Q
        Q = self.M_step(alpha, beta, O)

        while abs(alpha_p - alpha) > tolerance or abs(beta_p - beta) > tolerance:
            alpha_p = alpha
            beta_p = beta
            alpha, beta, O = self.E_step(Q)
            Q = self.M_step(alpha, beta, O)
            self.iterations += 1

        if self.homophily > 0.5:
            Q += self.adj
        return (alpha, beta, O, Q, self.iterations)