diff --git a/netZooPy/cobra/cobra.py b/netZooPy/cobra/cobra.py index 775a9250..cc97a8c6 100644 --- a/netZooPy/cobra/cobra.py +++ b/netZooPy/cobra/cobra.py @@ -1,9 +1,10 @@ import pandas as pd import numpy as np from scipy.linalg import eigh,pinv +from sklearn.linear_model import LinearRegression, Lasso -def cobra(X, expression): +def cobra(X, expression, cobra='nnls', alpha: np.float64=0.1, mode='corr'): """ COBRA decomposes a (partial) gene co-expression matrix as a linear combination of covariate-specific components. @@ -13,9 +14,21 @@ def cobra(X, expression): Parameters ----------- X : np.ndarray, pd.DataFrame - design matrix of size (n, q), n = number of samples, q = number of covariates + design matrix of size (n, q), n = number of samples, q = number of covariates. + Please add an intercept column in X (first column made of ones). expression : np.ndarray, pd.DataFrame gene expression as a matrix of size (g, n), g = number of genes + cobra : string + regression mode + nnls: Non-negative least square (default) + nnlasso: Non-negative LASSO + MLE: Maximum likelihood estimation + alpha : np.float64 + Regularization parameter for the LASSO model (default 0.1). Only used when cobra='nnlasso'. + mode : string + Type of matrix to decompose. + corr: Correlation matrix (default). Gene expressions are centered and normalized to unit norm. + cov: Covariance matrix. Gene expressions are centered and divided by sqrt(n-1). Returns --------- psi : array @@ -43,8 +56,13 @@ def cobra(X, expression): _, q = X.shape # Standardize Gene Expressions - g = expression - expression.mean(axis=1).reshape(-1, 1) - g = g / np.linalg.norm(g, axis=1)[:, None] + g = expression - expression.mean(axis=1).reshape(-1, 1) + if mode == 'corr': + g = g / np.linalg.norm(g, axis=1)[:, None] + elif mode == 'cov': + g = g / np.sqrt(n - 1) + else: + raise ValueError("mode must be 'corr' or 'cov'.") # Co-expression Matrix c = np.dot(g, g.T) @@ -55,26 +73,32 @@ def cobra(X, expression): Q = c_eigenvectors[:, indices_nonzero].T[::-1].T # - gtq = np.matmul(g.T, Q) d = c_eigenvalues[indices_nonzero][::-1] - # - xtx_inv = np.linalg.pinv( - np.dot(X.T, X) - ) - xtx_inv_xt = np.dot( - xtx_inv, X.T - ) + if cobra=='nnls': + model = LinearRegression(positive=True, fit_intercept=False).fit(X, np.diag(d) ) + psi = np.transpose(model.coef_) + elif cobra=='nnlasso': + model = Lasso(alpha=alpha, positive=True, fit_intercept=False).fit(X, np.diag(d) ) + psi = np.transpose(model.coef_) + elif cobra=='MLE': + gtq = np.matmul(g.T, Q) + xtx_inv = np.linalg.pinv( + np.dot(X.T, X) + ) + xtx_inv_xt = np.dot( + xtx_inv, X.T + ) - # - psi = np.zeros((q, n)) + # + psi = np.zeros((q, n)) - for i in range(q): - for h in range(n): - psi[i, h] = n * np.sum([ - ( - xtx_inv_xt[i, k] * gtq[k, h] ** 2 - ) for k in range(n) - ]) + for i in range(q): + for h in range(n): + psi[i, h] = n * np.sum([ + ( + xtx_inv_xt[i, k] * gtq[k, h] ** 2 + ) for k in range(n) + ]) return psi, Q, d, g diff --git a/netZooPy/condor/condor.py b/netZooPy/condor/condor.py index 0e755bd9..60a9e2c8 100644 --- a/netZooPy/condor/condor.py +++ b/netZooPy/condor/condor.py @@ -305,13 +305,13 @@ def matrices(self, c,resolution): # Computes weighted biadjacency matrix. A = np.matrix(np.zeros((p, q))) for edge in self.net.iterrows(): - A[gn[edge[1][1]], rg[edge[1][0]]] = edge[1][2] + A[gn[edge[1].iloc[1]], rg[edge[1].iloc[0]]] = edge[1].iloc[2] # Computes node degrees for the nodesets. ki = A.sum(1) dj = A.sum(0) # Computes sum of edges and bimodularity matrix. - m = float(sum(ki)) + m = float(np.asarray(ki).sum()) B = A - resolution*((ki @ dj) / m) # d = self.index_dict diff --git a/requirements.txt b/requirements.txt index 7e2c6abf..15959cd7 100755 --- a/requirements.txt +++ b/requirements.txt @@ -7,4 +7,5 @@ igraph joblib statsmodels click -torch \ No newline at end of file +scikit-learn +torch diff --git a/tests/test_cobra.py b/tests/test_cobra.py index 2fe940fc..f5c12108 100644 --- a/tests/test_cobra.py +++ b/tests/test_cobra.py @@ -23,7 +23,7 @@ def test_cobra(): G_gt = pd.read_csv(G_gt_path, index_col=0) # Call COBRA - psi, Q, D, G = cobra.cobra(X, expression) + psi, Q, D, G = cobra.cobra(X, expression, cobra='MLE') # Cast output to pandas psi = pd.DataFrame(psi, index=psi_gt.index, columns=psi_gt.columns) @@ -37,7 +37,8 @@ def test_cobra(): pd.testing.assert_frame_equal(G, G_gt, rtol=1e-10, check_exact=False) q = psi.shape[0] + X_mean = np.mean(X.to_numpy(), axis=0) for i in range(q): - C = Q.to_numpy().dot(np.mean(X, axis=0)[i] * np.diag(psi.to_numpy()[i, :])).dot(Q.to_numpy().T) - C_gt = Q_gt.to_numpy().dot(np.mean(X, axis=0)[i] * np.diag(psi_gt.to_numpy()[i, :])).dot(Q_gt.to_numpy().T) + C = Q.to_numpy().dot(X_mean[i] * np.diag(psi.to_numpy()[i, :])).dot(Q.to_numpy().T) + C_gt = Q_gt.to_numpy().dot(X_mean[i] * np.diag(psi_gt.to_numpy()[i, :])).dot(Q_gt.to_numpy().T) pd.testing.assert_frame_equal(pd.DataFrame(C), pd.DataFrame(C_gt), rtol=1e-10, check_exact=False)