Skip to content

Summary Statistics Imputation

Summary statistics imputation leverages linkage disequilibrium (LD) information to compute Z-scores of missing SNPs from neighbouring observed SNPs SNPs by taking advantage of the Linkage Disequilibrium.

We implemented the basic model from RAISS (Robust and Accurate Imputation from Summary Statistics) package (see the original paper).

The full repository for the RAISS package can be found here.

The original model was suggested in 2014 by Bogdan Pasaniuc et al. here.

It represents the following formula:

E(zi|z_t) = M{i,t} \cdot (M_{t,t})^{-1} \cdot z_t

Where:

  • E(z_i|z_t) represents the expected z-score of SNP 'i' given the observed z-scores at known SNP indexes 't'.

  • M_{i,t} represents the LD (Linkage Disequilibrium) matrix between SNP 'i' and the known SNPs at indexes 't'.

  • (M_{t,t})^{-1} represents the inverse of the LD matrix of the known SNPs at indexes 't'.

  • z_t represents the vector of observed z-scores at the known SNP indexes 't'.

gentropy.method.sumstat_imputation.SummaryStatisticsImputation

Implementation of RAISS summary statstics imputation model.

Source code in src/gentropy/method/sumstat_imputation.py
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
class SummaryStatisticsImputation:
    """Implementation of RAISS summary statstics imputation model."""

    @staticmethod
    def raiss_model(
        z_scores_known: np.ndarray,
        ld_matrix_known: np.ndarray,
        ld_matrix_known_missing: np.ndarray,
        lamb: float = 0.01,
        rtol: float = 0.01,
    ) -> dict[str, Any]:
        """Compute the imputation of the z-score using the RAISS model.

        Args:
            z_scores_known (np.ndarray): the vector of known Z scores
            ld_matrix_known (np.ndarray) : the matrix of known LD correlations
            ld_matrix_known_missing (np.ndarray): LD matrix of known SNPs with other unknown SNPs in large matrix (similar to `ld[unknowns, :][:,known]`)
            lamb (float): size of the small value added to the diagonal of the covariance matrix before inversion. Defaults to 0.01.
            rtol (float): threshold to filter eigenvectos by its eigenvalue. It makes an inversion biased but much more numerically robust. Default to 0.01.

        Returns:
            dict[str, Any]:
                - var (np.ndarray): variance of the imputed SNPs
                - mu (np.ndarray): the estimation of the zscore of the imputed SNPs
                - ld_score (np.ndarray): the linkage disequilibrium score of the imputed SNPs
                - condition_number (np.ndarray): the condition number of the correlation matrix
                - correct_inversion (np.ndarray): a boolean array indicating if the inversion was successful
                - imputation_r2 (np.ndarray): the R2 of the imputation
        """
        sig_t_inv = SummaryStatisticsImputation._invert_sig_t(
            ld_matrix_known, lamb, rtol
        )
        if sig_t_inv is None:
            return {
                "var": None,
                "mu": None,
                "ld_score": None,
                "condition_number": None,
                "correct_inversion": None,
                "imputation_r2": None,
            }
        else:
            condition_number = np.array(
                [np.linalg.cond(ld_matrix_known)] * ld_matrix_known_missing.shape[0]
            )
            correct_inversion = np.array(
                [
                    SummaryStatisticsImputation._check_inversion(
                        ld_matrix_known, sig_t_inv
                    )
                ]
                * ld_matrix_known_missing.shape[0]
            )

            var, ld_score = SummaryStatisticsImputation._compute_var(
                ld_matrix_known_missing, sig_t_inv, lamb
            )

            mu = SummaryStatisticsImputation._compute_mu(
                ld_matrix_known_missing, sig_t_inv, z_scores_known
            )
            var_norm = SummaryStatisticsImputation._var_in_boundaries(var, lamb)

            R2 = (1 + lamb) - var_norm

            mu = mu / np.sqrt(R2)
            return {
                "var": var,
                "mu": mu,
                "ld_score": ld_score,
                "condition_number": condition_number,
                "correct_inversion": correct_inversion,
                "imputation_r2": 1 - var,
            }

    @staticmethod
    def _compute_mu(
        sig_i_t: np.ndarray, sig_t_inv: np.ndarray, zt: np.ndarray
    ) -> np.ndarray:
        """Compute the estimation of z-score from neighborring snp.

        Args:
            sig_i_t (np.ndarray) : correlation matrix with line corresponding to unknown Snp (snp to impute) and column to known SNPs
            sig_t_inv (np.ndarray): inverse of the correlation matrix of known matrix
            zt (np.ndarray): Zscores of known snp
        Returns:
            np.ndarray: a vector of length i containing the estimate of zscore

        """
        return np.dot(sig_i_t, np.dot(sig_t_inv, zt))

    @staticmethod
    def _compute_var(
        sig_i_t: np.ndarray, sig_t_inv: np.ndarray, lamb: float
    ) -> tuple[np.ndarray, np.ndarray]:
        """Compute the expected variance of the imputed SNPs.

        Args:
            sig_i_t (np.ndarray) : correlation matrix with line corresponding to unknown Snp (snp to impute) and column to known SNPs
            sig_t_inv (np.ndarray): inverse of the correlation matrix of known matrix
            lamb (float): regularization term added to matrix

        Returns:
            tuple[np.ndarray, np.ndarray]: a tuple containing the variance and the ld score
        """
        var = (1 + lamb) - np.einsum(
            "ij,jk,ki->i", sig_i_t, sig_t_inv, sig_i_t.transpose()
        )
        ld_score = (sig_i_t**2).sum(1)

        return var, ld_score

    @staticmethod
    def _check_inversion(sig_t: np.ndarray, sig_t_inv: np.ndarray) -> bool:
        """Check if the inversion is correct.

        Args:
            sig_t (np.ndarray): the correlation matrix
            sig_t_inv (np.ndarray): the inverse of the correlation matrix
        Returns:
            bool: True if the inversion is correct, False otherwise
        """
        return np.allclose(sig_t, np.dot(sig_t, np.dot(sig_t_inv, sig_t)))

    @staticmethod
    def _var_in_boundaries(var: np.ndarray, lamb: float) -> np.ndarray:
        """Forces the variance to be in the 0 to 1+lambda boundary. Theoritically we shouldn't have to do that.

        Args:
            var (np.ndarray): the variance of the imputed SNPs
            lamb (float): regularization term added to the diagonal of the sig_t matrix

        Returns:
            np.ndarray: the variance of the imputed SNPs
        """
        id_neg = np.where(var < 0)
        var[id_neg] = 0
        id_inf = np.where(var > (0.99999 + lamb))
        var[id_inf] = 1

        return var

    @staticmethod
    def _invert_sig_t(sig_t: np.ndarray, lamb: float, rtol: float) -> np.ndarray:
        """Invert the correlation matrix. If the provided regularization values are not enough to stabilize the inversion process for the given matrix, the function calls itself recursively, increasing lamb and rtol by 10%.

        Args:
            sig_t (np.ndarray): the correlation matrix
            lamb (float): regularization term added to the diagonal of the sig_t matrix
            rtol (float): threshold to filter eigenvector with a eigenvalue under rtol make inversion biased but much more numerically robust

        Returns:
            np.ndarray: the inverse of the correlation matrix
        """
        try:
            np.fill_diagonal(sig_t, (1 + lamb))
            sig_t_inv = scipy.linalg.pinv(sig_t, rtol=rtol, atol=0)
            return sig_t_inv
        except np.linalg.LinAlgError:
            return SummaryStatisticsImputation._invert_sig_t(
                sig_t, lamb * 1.1, rtol * 1.1
            )

raiss_model(z_scores_known: np.ndarray, ld_matrix_known: np.ndarray, ld_matrix_known_missing: np.ndarray, lamb: float = 0.01, rtol: float = 0.01) -> dict[str, Any] staticmethod

Compute the imputation of the z-score using the RAISS model.

Parameters:

Name Type Description Default
z_scores_known ndarray

the vector of known Z scores

required
ld_matrix_known np.ndarray)

the matrix of known LD correlations

required
ld_matrix_known_missing ndarray

LD matrix of known SNPs with other unknown SNPs in large matrix (similar to ld[unknowns, :][:,known])

required
lamb float

size of the small value added to the diagonal of the covariance matrix before inversion. Defaults to 0.01.

0.01
rtol float

threshold to filter eigenvectos by its eigenvalue. It makes an inversion biased but much more numerically robust. Default to 0.01.

0.01

Returns:

Type Description
dict[str, Any]

dict[str, Any]: - var (np.ndarray): variance of the imputed SNPs - mu (np.ndarray): the estimation of the zscore of the imputed SNPs - ld_score (np.ndarray): the linkage disequilibrium score of the imputed SNPs - condition_number (np.ndarray): the condition number of the correlation matrix - correct_inversion (np.ndarray): a boolean array indicating if the inversion was successful - imputation_r2 (np.ndarray): the R2 of the imputation

Source code in src/gentropy/method/sumstat_imputation.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
@staticmethod
def raiss_model(
    z_scores_known: np.ndarray,
    ld_matrix_known: np.ndarray,
    ld_matrix_known_missing: np.ndarray,
    lamb: float = 0.01,
    rtol: float = 0.01,
) -> dict[str, Any]:
    """Compute the imputation of the z-score using the RAISS model.

    Args:
        z_scores_known (np.ndarray): the vector of known Z scores
        ld_matrix_known (np.ndarray) : the matrix of known LD correlations
        ld_matrix_known_missing (np.ndarray): LD matrix of known SNPs with other unknown SNPs in large matrix (similar to `ld[unknowns, :][:,known]`)
        lamb (float): size of the small value added to the diagonal of the covariance matrix before inversion. Defaults to 0.01.
        rtol (float): threshold to filter eigenvectos by its eigenvalue. It makes an inversion biased but much more numerically robust. Default to 0.01.

    Returns:
        dict[str, Any]:
            - var (np.ndarray): variance of the imputed SNPs
            - mu (np.ndarray): the estimation of the zscore of the imputed SNPs
            - ld_score (np.ndarray): the linkage disequilibrium score of the imputed SNPs
            - condition_number (np.ndarray): the condition number of the correlation matrix
            - correct_inversion (np.ndarray): a boolean array indicating if the inversion was successful
            - imputation_r2 (np.ndarray): the R2 of the imputation
    """
    sig_t_inv = SummaryStatisticsImputation._invert_sig_t(
        ld_matrix_known, lamb, rtol
    )
    if sig_t_inv is None:
        return {
            "var": None,
            "mu": None,
            "ld_score": None,
            "condition_number": None,
            "correct_inversion": None,
            "imputation_r2": None,
        }
    else:
        condition_number = np.array(
            [np.linalg.cond(ld_matrix_known)] * ld_matrix_known_missing.shape[0]
        )
        correct_inversion = np.array(
            [
                SummaryStatisticsImputation._check_inversion(
                    ld_matrix_known, sig_t_inv
                )
            ]
            * ld_matrix_known_missing.shape[0]
        )

        var, ld_score = SummaryStatisticsImputation._compute_var(
            ld_matrix_known_missing, sig_t_inv, lamb
        )

        mu = SummaryStatisticsImputation._compute_mu(
            ld_matrix_known_missing, sig_t_inv, z_scores_known
        )
        var_norm = SummaryStatisticsImputation._var_in_boundaries(var, lamb)

        R2 = (1 + lamb) - var_norm

        mu = mu / np.sqrt(R2)
        return {
            "var": var,
            "mu": mu,
            "ld_score": ld_score,
            "condition_number": condition_number,
            "correct_inversion": correct_inversion,
            "imputation_r2": 1 - var,
        }