Single multi-omic dataset (mouse brain)
[ ]:
import numpy as np
import scanpy as sc
import scvelo as scv
import torch
import matplotlib.pyplot as plt
import umap
import os
import multivelovae as vv
from datetime import datetime
[2]:
torch.cuda.is_available()
[2]:
True
[3]:
now = datetime.now()
date = now.strftime("%m_%d_%Y")
Read in processed data and define places to store output
[4]:
dataset = "10XMouse"
adata = sc.read_h5ad('adata_postpro.h5ad')
adata_atac = sc.read_h5ad('adata_atac_postpro.h5ad')
[5]:
model_path_base = f"checkpoints/{dataset}"
figure_path_base = f"figures/{dataset}"
data_path_base = f"data/{dataset}"
[6]:
gene_plot = ["Robo2", "Satb2", 'Gria2', 'Grin2b', 'Eomes', 'Tle4']
np.all(np.isin(gene_plot, adata.var_names))
[6]:
True
[7]:
adata_atac.layers['Mc'] = adata_atac.layers['Mc'].A
[8]:
adata, adata_atac
[8]:
(AnnData object with n_obs × n_vars = 3365 × 936
obs: 'n_counts', 'leiden'
var: 'Accession', 'Chromosome', 'End', 'Start', 'Strand'
uns: 'leiden_colors', 'neighbors', 'pca', 'umap'
obsm: 'X_pca', 'X_umap'
varm: 'PCs'
layers: 'Ms', 'Mu', 'ambiguous', 'matrix', 'spliced', 'unspliced'
obsp: 'connectivities', 'distances',
AnnData object with n_obs × n_vars = 3365 × 936
obs: 'n_counts'
layers: 'Mc'
obsp: 'connectivities')
[9]:
os.makedirs(figure_path_base, exist_ok=True)
scv.pl.scatter(adata, basis='umap', color='leiden')
[10]:
# Let's split RG, Astro, and OPC
adata_sub = adata[adata.obs['leiden'] == 'RG, Astro, OPC', :].copy()
sc.tl.leiden(adata_sub, resolution=0.3)
sc.pl.umap(adata_sub, color='leiden')
[11]:
adata.obs['leiden'] = adata.obs['leiden'].astype('str')
adata.obs.loc[adata_sub[adata_sub.obs['leiden']=='0', :].obs_names, 'leiden'] = 'Astrocyte'
adata.obs.loc[adata_sub[adata_sub.obs['leiden']=='1', :].obs_names, 'leiden'] = 'Radial Glia'
adata.obs.loc[adata_sub[adata_sub.obs['leiden']=='2', :].obs_names, 'leiden'] = 'OPC'
adata.obs.loc[adata.obs['leiden']=='Ependymal cells', 'leiden'] = 'Ependymal'
adata.obs.loc[adata.obs['leiden']=='IPC', 'leiden'] = 'nIPC'
[12]:
scv.pl.scatter(adata, basis='umap', color='leiden', save=figure_path_base+"/umap.png")
saving figure to file figures/10XMouse/umap.png
[13]:
figure_path = figure_path_base+'/'+date
model_path = model_path_base+'/'+date
data_path = data_path_base
Initialize and train a MultiVeloVAE
[14]:
key = 'vae'
[15]:
torch.manual_seed(2022)
np.random.seed(2022)
model = vv.VAEChrom(adata,
adata_atac,
device='cuda:0',
plot_init=False,
gene_plot=gene_plot,
cluster_key="leiden",
figure_path=figure_path,
embed="umap")
model.train(plot=False,
gene_plot=gene_plot,
figure_path=figure_path,
embed="umap")
model.save_model(model_path)
model.save_anndata(data_path, file_name="out.h5ad")
Latent dimension set to 7.
Learning rate set to 5.8e-4 based on data sparsity.
Early stop threshold set to 0.5.
Using Gaussian Prior.
Initializing using the steady-state and dynamical models.
862 out of 936 = 92.1% genes have good ellipse fits.
KS-test result: [1. 1. 1. 1. 1. 1. 1.]
Assigning cluster 6 to repressive.
Initial induction: 734, repression: 202 out of 936.
-------------------------- Train a MultiVeloVAE -------------------------
********* Creating Training and Validation Datasets *********
Total Number of Iterations Per Epoch: 10, test iteration: 18
********* Finished. *********
********* Stage 1 *********
Epoch 1: Train ELBO = -3673.793, Test ELBO = -20168.468 Total Time = 0 h : 0 m : 0 s
Epoch 50: Train ELBO = 3012.307, Test ELBO = 3009.576 Total Time = 0 h : 0 m : 3 s
Epoch 100: Train ELBO = 3264.893, Test ELBO = 3246.065 Total Time = 0 h : 0 m : 5 s
Epoch 150: Train ELBO = 3358.157, Test ELBO = 3345.258 Total Time = 0 h : 0 m : 8 s
Epoch 200: Train ELBO = 3394.092, Test ELBO = 3378.623 Total Time = 0 h : 0 m : 11 s
Epoch 250: Train ELBO = 3413.315, Test ELBO = 3403.028 Total Time = 0 h : 0 m : 14 s
Epoch 300: Train ELBO = 3425.801, Test ELBO = 3411.986 Total Time = 0 h : 0 m : 17 s
Epoch 350: Train ELBO = 3432.051, Test ELBO = 3418.478 Total Time = 0 h : 0 m : 19 s
Epoch 400: Train ELBO = 3435.243, Test ELBO = 3422.525 Total Time = 0 h : 0 m : 22 s
Epoch 450: Train ELBO = 3440.930, Test ELBO = 3427.428 Total Time = 0 h : 0 m : 25 s
Epoch 500: Train ELBO = 3445.022, Test ELBO = 3429.870 Total Time = 0 h : 0 m : 28 s
Epoch 550: Train ELBO = 3449.475, Test ELBO = 3431.691 Total Time = 0 h : 0 m : 30 s
Epoch 600: Train ELBO = 3452.604, Test ELBO = 3436.290 Total Time = 0 h : 0 m : 33 s
Epoch 650: Train ELBO = 3453.146, Test ELBO = 3436.094 Total Time = 0 h : 0 m : 36 s
Epoch 700: Train ELBO = 3455.825, Test ELBO = 3439.674 Total Time = 0 h : 0 m : 38 s
Epoch 750: Train ELBO = 3457.769, Test ELBO = 3440.246 Total Time = 0 h : 0 m : 41 s
Epoch 800: Train ELBO = 3457.021, Test ELBO = 3441.046 Total Time = 0 h : 0 m : 44 s
Epoch 850: Train ELBO = 3461.260, Test ELBO = 3441.756 Total Time = 0 h : 0 m : 47 s
Epoch 900: Train ELBO = 3459.185, Test ELBO = 3444.346 Total Time = 0 h : 0 m : 49 s
Epoch 950: Train ELBO = 3462.295, Test ELBO = 3446.231 Total Time = 0 h : 0 m : 52 s
Epoch 1000: Train ELBO = 3464.646, Test ELBO = 3444.447 Total Time = 0 h : 0 m : 55 s
Epoch 1050: Train ELBO = 3463.852, Test ELBO = 3447.959 Total Time = 0 h : 0 m : 58 s
********* Stage 1: Early Stop Triggered at epoch 1061. *********
********* Retrieving best model from iteration 10315. *********
********* Stage 2 *********
Cell-wise KNN estimation.
Using 117 latent neighbors to select ancestors.
Percentage of Invalid Sets: 0.002
Average Set Size: 14
Finished. Actual Time: 0 h : 0 m : 2 s
********* Velocity Refinement Round 1 *********
Epoch 1086: Train ELBO = 3391.065, Test ELBO = 3382.398 Total Time = 0 h : 1 m : 2 s
********* Round 1: Early Stop Triggered at epoch 1086. *********
********* Retrieving best model from iteration 10847. *********
Cell-wise KNN estimation.
Finished. Actual Time: 0 h : 0 m : 0 s
********* Velocity Refinement Round 2 *********
Epoch 1110: Train ELBO = 3323.292, Test ELBO = 3332.454 Total Time = 0 h : 1 m : 3 s
********* Round 2: Early Stop Triggered at epoch 1110. *********
********* Retrieving best model from iteration 11081. *********
Change in x0: 0.183
Cell-wise KNN estimation.
Finished. Actual Time: 0 h : 0 m : 0 s
********* Velocity Refinement Round 3 *********
Epoch 1135: Train ELBO = 3263.203, Test ELBO = 3274.866 Total Time = 0 h : 1 m : 5 s
********* Round 3: Early Stop Triggered at epoch 1135. *********
********* Retrieving best model from iteration 11324. *********
Change in x0: 0.152
Cell-wise KNN estimation.
Finished. Actual Time: 0 h : 0 m : 0 s
********* Velocity Refinement Round 4 *********
Epoch 1141: Train ELBO = 3194.874, Test ELBO = 3210.376 Total Time = 0 h : 1 m : 6 s
********* Round 4: Early Stop Triggered at epoch 1141. *********
********* Retrieving best model from iteration 11378. *********
Change in x0: 0.138
Cell-wise KNN estimation.
Finished. Actual Time: 0 h : 0 m : 0 s
********* Velocity Refinement Round 5 *********
Epoch 1164: Train ELBO = 3133.437, Test ELBO = 3155.457 Total Time = 0 h : 1 m : 7 s
********* Round 5: Early Stop Triggered at epoch 1164. *********
********* Retrieving best model from iteration 11603. *********
Change in x0: 0.123
Cell-wise KNN estimation.
Finished. Actual Time: 0 h : 0 m : 0 s
********* Velocity Refinement Round 6 *********
Epoch 1170: Train ELBO = 3075.613, Test ELBO = 3098.700 Total Time = 0 h : 1 m : 8 s
********* Round 6: Early Stop Triggered at epoch 1170. *********
********* Retrieving best model from iteration 11657. *********
Change in x0: 0.113
Final: Train ELBO = 3075.613, Test ELBO = 3098.700
********* Finished. Total Time = 0 h : 1 m : 8 s *********
Computing velocity.
Selected 656 velocity genes.
Writing anndata output to file.
Downstream analyses
[16]:
std_z = adata.obsm[f"{key}_std_z"]
z = adata.obsm[f"{key}_z"]
[17]:
# UMAP embedding from latent cell space
umap_obj = umap.UMAP(n_neighbors=30, n_components=2, min_dist=0.25, random_state=2022)
z_umap = umap_obj.fit_transform(z)
[18]:
adata.obsm['X_z_umap'] = z_umap
[19]:
# Compute velocity graph and visualize it, together with inferred latent time
vv.model.velocity_graph(adata, key=key)
vv.velocity_embedding_stream(adata, key=key, basis='umap', color='leiden', title="", figsize=(8,6), legend_loc='right margin')
vv.velocity_embedding_stream(adata, key=key, basis='umap', color=f'{key}_time', color_map='gnuplot', title="", figsize=(8,6), legend_loc='right margin')
computing velocity graph (using 1/8 cores)
finished (0:00:04) --> added
'vae_velocity_norm_graph', sparse matrix with cosine correlations (adata.uns)
computing velocity embedding
finished (0:00:00) --> added
'vae_velocity_norm_umap', embedded velocity vectors (adata.obsm)
[20]:
# Show Z UMAP
vv.velocity_embedding_stream(adata, key=key, basis='z_umap', color='leiden', title="", figsize=(8,6), legend_loc='right margin')
vv.velocity_embedding_stream(adata, key=key, basis='z_umap', color=f'{key}_time', color_map='gnuplot', title="", figsize=(8,6), legend_loc='right margin')
computing velocity embedding
finished (0:00:00) --> added
'vae_velocity_norm_z_umap', embedded velocity vectors (adata.obsm)
[21]:
# Decouping and coupling factors on UMAPs
adata.layers[f'{key}_decoupling'] = adata.layers[f'{key}_kc'] - adata.layers[f'{key}_rho']
adata.layers[f'{key}_coupling'] = adata.layers[f'{key}_kc'] + adata.layers[f'{key}_rho'] - 1
scv.pl.scatter(adata, basis='umap', color=np.mean(adata[:, adata.var[f'{key}_velocity_genes']].layers[f'{key}_decoupling'], axis=1), legend_loc='right margin')
scv.pl.scatter(adata, basis='z_umap', color=np.mean(adata[:, adata.var[f'{key}_velocity_genes']].layers[f'{key}_decoupling'], axis=1), legend_loc='right margin')
[22]:
# Dynamic plots of modality level by latent time for genes of interest
vv.dynamic_plot(adata, adata_atac, gene_plot, color_by='leiden')
[23]:
# Dynamic plots of reconstructed values only
vv.dynamic_plot(adata, adata_atac, gene_plot, color_by='leiden', show_pred_only=True)
[24]:
# Phase portraits of recontructed unspliced vs spliced for genes of interest, colored by cell type, latent time, and transcription state
scv.pl.scatter(adata, basis=['Robo2', 'Satb2', 'Gria2', 'Grin2b'], x=f'{key}_shat', y=f'{key}_uhat', color='leiden', legend_loc='none', frameon=False, fontsize=20)
scv.pl.scatter(adata, basis=['Robo2', 'Satb2', 'Gria2', 'Grin2b'], x=f'{key}_shat', y=f'{key}_uhat', color=f'{key}_time', legend_loc='none', frameon=False, fontsize=20, color_map='gnuplot')
scv.pl.scatter(adata, basis=['Robo2', 'Satb2', 'Gria2', 'Grin2b'], x=f'{key}_shat', y=f'{key}_uhat', color=f'{key}_rho', legend_loc='none', frameon=False, fontsize=20, color_map='coolwarm')
[25]:
# Phase portraits of recontructed chromatin vs unspliced for genes of interest, colored by cell type, latent time, and transcription state
scv.pl.scatter(adata, basis=['Robo2', 'Satb2', 'Gria2', 'Grin2b'], x=f'{key}_uhat', y=f'{key}_chat', color='leiden', legend_loc='none', frameon=False, fontsize=20)
scv.pl.scatter(adata, basis=['Robo2', 'Satb2', 'Gria2', 'Grin2b'], x=f'{key}_uhat', y=f'{key}_chat', color=f'{key}_time', legend_loc='none', frameon=False, fontsize=20, color_map='gnuplot')
scv.pl.scatter(adata, basis=['Robo2', 'Satb2', 'Gria2', 'Grin2b'], x=f'{key}_uhat', y=f'{key}_chat', color=f'{key}_kc', legend_loc='none', frameon=False, fontsize=20, color_map='coolwarm')
Differential (de)coupling test
[26]:
import plotnine as p9
p9.options.dpi = 100
p9.options.figure_size = (5, 4)
[27]:
# Define comparison pairs
a_list = [['V-SVZ'],
['Upper Layer'],
['Upper Layer', 'Deeper Layer'],
['Deeper Layer']]
b_list = [['nIPC', 'Upper Layer'],
['V-SVZ', 'Deeper Layer'],
['V-SVZ'],
['V-SVZ', 'Upper Layer']]
[28]:
def differential_func(adata, adata_atac, a, b):
df_dd = vv.model.differential_dynamics(adata, adata_atac, model=model, idx1=a, idx2=b, mode='change', test_decoupling=True, save_raw=True)
df_dd["log10_pscore"] = np.log10(df_dd["p_decoupling_no_change"])
df_dd["gene_type"] = "Other"
df_dd.loc['Satb2', "gene_type"] = 'Satb2'
df_dd.loc['Robo2', "gene_type"] = 'Robo2'
df_dd.loc['Gria2', "gene_type"] = 'Gria2'
df_dd['gene_name'] = df_dd.index
df_dd.loc[~np.isin(df_dd['gene_name'], ['Satb2', 'Robo2', 'Gria2']), 'gene_name'] = np.nan
p = (
p9.ggplot(df_dd, p9.aes("log2_diff_decoupling", "-log10_pscore", color="gene_type"))
+ p9.geom_point(
df_dd.query("gene_type == 'Other'"), size=0.5, alpha=0.5, shape='.'
) # Plot other genes with transparence
+ p9.geom_point(df_dd.query("gene_type != 'Other'"))
+ p9.labs(x="Mean log2 decoupling factor (M1 --> M2)", y="Significance score")
+ p9.geom_text(p9.aes(label="gene_name"), nudge_y=0.08, nudge_x=-0.05, na_rm=True, size=10)
+ p9.scale_color_manual(values=['black', 'tab:red', 'tab:red', 'tab:red'])
+ p9.ggtitle("Differential decoupling")
+ p9.theme(legend_position='none')
)
print(p)
df_dd["log10_pscore"] = np.log10(df_dd["p_coupling_no_change"])
df_dd["gene_type"] = "Other"
df_dd.loc['Satb2', "gene_type"] = 'Satb2'
df_dd.loc['Robo2', "gene_type"] = 'Robo2'
df_dd.loc['Gria2', "gene_type"] = 'Gria2'
df_dd['gene_name'] = df_dd.index
df_dd.loc[~np.isin(df_dd['gene_name'], ['Satb2', 'Robo2', 'Gria2']), 'gene_name'] = np.nan
p = (
p9.ggplot(df_dd, p9.aes("log2_diff_coupling", "-log10_pscore", color="gene_type"))
+ p9.geom_point(
df_dd.query("gene_type == 'Other'"), size=0.5, alpha=0.5, shape='.'
) # Plot other genes with transparence
+ p9.geom_point(df_dd.query("gene_type != 'Other'"))
+ p9.labs(x="Mean log2 coupling factor (repression --> induction)", y="Significance score")
+ p9.geom_text(p9.aes(label="gene_name"), nudge_y=0.08, nudge_x=-0.1, na_rm=True, size=10)
+ p9.scale_color_manual(values=['black', 'tab:red', 'tab:red', 'tab:red'])
+ p9.ggtitle("Differential coupling")
+ p9.theme(legend_position='none')
)
print(p)
[29]:
# Plot volcano plots for each comparison
for a, b in zip(a_list, b_list):
print(f"Comparing {a} vs {b}")
a = np.isin(adata.obs['leiden'], a)
b = np.isin(adata.obs['leiden'], b)
differential_func(adata, adata_atac, a, b)
Comparing ['V-SVZ'] vs ['nIPC', 'Upper Layer']
Comparing ['Upper Layer'] vs ['V-SVZ', 'Deeper Layer']
Comparing ['Upper Layer', 'Deeper Layer'] vs ['V-SVZ']
Comparing ['Deeper Layer'] vs ['V-SVZ', 'Upper Layer']
Distributions of inferred rate parameters
[30]:
fig, axs = plt.subplots(1, 6, figsize=(40, 5))
axs[0].hist(adata.var[f'{key}_alpha_c'], bins=100);
axs[1].hist(adata.var[f'{key}_alpha'], bins=100);
axs[2].hist(adata.var[f'{key}_beta'], bins=100);
axs[3].hist(adata.var[f'{key}_gamma'], bins=100);
axs[4].hist(adata.var[f'{key}_scaling_c'], bins=100);
axs[5].hist(adata.var[f'{key}_scaling_u'], bins=100);
Save the final result
[31]:
adata.write_h5ad(data_path_base+"/final.h5ad")
[ ]: