Single multi-omic dataset (mouse brain)

[ ]:

import numpy as np
import scanpy as sc
import scvelo as scv
import torch
import matplotlib.pyplot as plt
import umap
import os
import multivelovae as vv
from datetime import datetime

[2]:

torch.cuda.is_available()

[2]:

True

[3]:

now = datetime.now()
date = now.strftime("%m_%d_%Y")

Read in processed data and define places to store output

[4]:

dataset = "10XMouse"
adata = sc.read_h5ad('adata_postpro.h5ad')
adata_atac = sc.read_h5ad('adata_atac_postpro.h5ad')

[5]:

model_path_base = f"checkpoints/{dataset}"
figure_path_base = f"figures/{dataset}"
data_path_base = f"data/{dataset}"

[6]:

gene_plot = ["Robo2", "Satb2", 'Gria2', 'Grin2b', 'Eomes', 'Tle4']
np.all(np.isin(gene_plot, adata.var_names))

[6]:

True

[7]:

adata_atac.layers['Mc'] = adata_atac.layers['Mc'].A

[8]:

adata, adata_atac

[8]:

(AnnData object with n_obs × n_vars = 3365 × 936
     obs: 'n_counts', 'leiden'
     var: 'Accession', 'Chromosome', 'End', 'Start', 'Strand'
     uns: 'leiden_colors', 'neighbors', 'pca', 'umap'
     obsm: 'X_pca', 'X_umap'
     varm: 'PCs'
     layers: 'Ms', 'Mu', 'ambiguous', 'matrix', 'spliced', 'unspliced'
     obsp: 'connectivities', 'distances',
 AnnData object with n_obs × n_vars = 3365 × 936
     obs: 'n_counts'
     layers: 'Mc'
     obsp: 'connectivities')

[9]:

os.makedirs(figure_path_base, exist_ok=True)
scv.pl.scatter(adata, basis='umap', color='leiden')

[10]:

# Let's split RG, Astro, and OPC
adata_sub = adata[adata.obs['leiden'] == 'RG, Astro, OPC', :].copy()
sc.tl.leiden(adata_sub, resolution=0.3)
sc.pl.umap(adata_sub, color='leiden')

[11]:

adata.obs['leiden'] = adata.obs['leiden'].astype('str')
adata.obs.loc[adata_sub[adata_sub.obs['leiden']=='0', :].obs_names, 'leiden'] = 'Astrocyte'
adata.obs.loc[adata_sub[adata_sub.obs['leiden']=='1', :].obs_names, 'leiden'] = 'Radial Glia'
adata.obs.loc[adata_sub[adata_sub.obs['leiden']=='2', :].obs_names, 'leiden'] = 'OPC'
adata.obs.loc[adata.obs['leiden']=='Ependymal cells', 'leiden'] = 'Ependymal'
adata.obs.loc[adata.obs['leiden']=='IPC', 'leiden'] = 'nIPC'

[12]:

scv.pl.scatter(adata, basis='umap', color='leiden', save=figure_path_base+"/umap.png")

saving figure to file figures/10XMouse/umap.png

[13]:

figure_path = figure_path_base+'/'+date
model_path = model_path_base+'/'+date
data_path = data_path_base

Initialize and train a MultiVeloVAE

[14]:

key = 'vae'

[15]:

torch.manual_seed(2022)
np.random.seed(2022)
model = vv.VAEChrom(adata,
                    adata_atac,
                    device='cuda:0',
                    plot_init=False,
                    gene_plot=gene_plot,
                    cluster_key="leiden",
                    figure_path=figure_path,
                    embed="umap")

model.train(plot=False,
            gene_plot=gene_plot,
            figure_path=figure_path,
            embed="umap")

model.save_model(model_path)
model.save_anndata(data_path, file_name="out.h5ad")

Latent dimension set to 7.
Learning rate set to 5.8e-4 based on data sparsity.
Early stop threshold set to 0.5.
Using Gaussian Prior.
Initializing using the steady-state and dynamical models.

862 out of 936 = 92.1% genes have good ellipse fits.
KS-test result: [1. 1. 1. 1. 1. 1. 1.]
Assigning cluster 6 to repressive.
Initial induction: 734, repression: 202 out of 936.
-------------------------- Train a MultiVeloVAE -------------------------
*********      Creating Training and Validation Datasets      *********
Total Number of Iterations Per Epoch: 10, test iteration: 18
*********                      Finished.                      *********
*********                      Stage  1                       *********
Epoch 1: Train ELBO = -3673.793, Test ELBO = -20168.468         Total Time =   0 h :  0 m :  0 s
Epoch 50: Train ELBO = 3012.307, Test ELBO = 3009.576           Total Time =   0 h :  0 m :  3 s
Epoch 100: Train ELBO = 3264.893, Test ELBO = 3246.065          Total Time =   0 h :  0 m :  5 s
Epoch 150: Train ELBO = 3358.157, Test ELBO = 3345.258          Total Time =   0 h :  0 m :  8 s
Epoch 200: Train ELBO = 3394.092, Test ELBO = 3378.623          Total Time =   0 h :  0 m : 11 s
Epoch 250: Train ELBO = 3413.315, Test ELBO = 3403.028          Total Time =   0 h :  0 m : 14 s
Epoch 300: Train ELBO = 3425.801, Test ELBO = 3411.986          Total Time =   0 h :  0 m : 17 s
Epoch 350: Train ELBO = 3432.051, Test ELBO = 3418.478          Total Time =   0 h :  0 m : 19 s
Epoch 400: Train ELBO = 3435.243, Test ELBO = 3422.525          Total Time =   0 h :  0 m : 22 s
Epoch 450: Train ELBO = 3440.930, Test ELBO = 3427.428          Total Time =   0 h :  0 m : 25 s
Epoch 500: Train ELBO = 3445.022, Test ELBO = 3429.870          Total Time =   0 h :  0 m : 28 s
Epoch 550: Train ELBO = 3449.475, Test ELBO = 3431.691          Total Time =   0 h :  0 m : 30 s
Epoch 600: Train ELBO = 3452.604, Test ELBO = 3436.290          Total Time =   0 h :  0 m : 33 s
Epoch 650: Train ELBO = 3453.146, Test ELBO = 3436.094          Total Time =   0 h :  0 m : 36 s
Epoch 700: Train ELBO = 3455.825, Test ELBO = 3439.674          Total Time =   0 h :  0 m : 38 s
Epoch 750: Train ELBO = 3457.769, Test ELBO = 3440.246          Total Time =   0 h :  0 m : 41 s
Epoch 800: Train ELBO = 3457.021, Test ELBO = 3441.046          Total Time =   0 h :  0 m : 44 s
Epoch 850: Train ELBO = 3461.260, Test ELBO = 3441.756          Total Time =   0 h :  0 m : 47 s
Epoch 900: Train ELBO = 3459.185, Test ELBO = 3444.346          Total Time =   0 h :  0 m : 49 s
Epoch 950: Train ELBO = 3462.295, Test ELBO = 3446.231          Total Time =   0 h :  0 m : 52 s
Epoch 1000: Train ELBO = 3464.646, Test ELBO = 3444.447         Total Time =   0 h :  0 m : 55 s
Epoch 1050: Train ELBO = 3463.852, Test ELBO = 3447.959         Total Time =   0 h :  0 m : 58 s
*********     Stage 1: Early Stop Triggered at epoch 1061.     *********
*********     Retrieving best model from iteration 10315.      *********
*********                      Stage  2                       *********
Cell-wise KNN estimation.
Using 117 latent neighbors to select ancestors.

Percentage of Invalid Sets: 0.002
Average Set Size: 14
Finished. Actual Time:   0 h :  0 m :  2 s
*********             Velocity Refinement Round 1             *********
Epoch 1086: Train ELBO = 3391.065, Test ELBO = 3382.398         Total Time =   0 h :  1 m :  2 s
*********     Round 1: Early Stop Triggered at epoch 1086.     *********
*********     Retrieving best model from iteration 10847.      *********
Cell-wise KNN estimation.
Finished. Actual Time:   0 h :  0 m :  0 s
*********             Velocity Refinement Round 2             *********
Epoch 1110: Train ELBO = 3323.292, Test ELBO = 3332.454         Total Time =   0 h :  1 m :  3 s
*********     Round 2: Early Stop Triggered at epoch 1110.     *********
*********     Retrieving best model from iteration 11081.      *********
Change in x0: 0.183
Cell-wise KNN estimation.
Finished. Actual Time:   0 h :  0 m :  0 s
*********             Velocity Refinement Round 3             *********
Epoch 1135: Train ELBO = 3263.203, Test ELBO = 3274.866         Total Time =   0 h :  1 m :  5 s
*********     Round 3: Early Stop Triggered at epoch 1135.     *********
*********     Retrieving best model from iteration 11324.      *********
Change in x0: 0.152
Cell-wise KNN estimation.
Finished. Actual Time:   0 h :  0 m :  0 s
*********             Velocity Refinement Round 4             *********
Epoch 1141: Train ELBO = 3194.874, Test ELBO = 3210.376         Total Time =   0 h :  1 m :  6 s
*********     Round 4: Early Stop Triggered at epoch 1141.     *********
*********     Retrieving best model from iteration 11378.      *********
Change in x0: 0.138
Cell-wise KNN estimation.
Finished. Actual Time:   0 h :  0 m :  0 s
*********             Velocity Refinement Round 5             *********
Epoch 1164: Train ELBO = 3133.437, Test ELBO = 3155.457         Total Time =   0 h :  1 m :  7 s
*********     Round 5: Early Stop Triggered at epoch 1164.     *********
*********     Retrieving best model from iteration 11603.      *********
Change in x0: 0.123
Cell-wise KNN estimation.
Finished. Actual Time:   0 h :  0 m :  0 s
*********             Velocity Refinement Round 6             *********
Epoch 1170: Train ELBO = 3075.613, Test ELBO = 3098.700         Total Time =   0 h :  1 m :  8 s
*********     Round 6: Early Stop Triggered at epoch 1170.     *********
*********     Retrieving best model from iteration 11657.      *********
Change in x0: 0.113
Final: Train ELBO = 3075.613,   Test ELBO = 3098.700
*********      Finished. Total Time =   0 h :  1 m :  8 s     *********
Computing velocity.
Selected 656 velocity genes.
Writing anndata output to file.

Downstream analyses

[16]:

std_z = adata.obsm[f"{key}_std_z"]
z = adata.obsm[f"{key}_z"]

[17]:

# UMAP embedding from latent cell space
umap_obj = umap.UMAP(n_neighbors=30, n_components=2, min_dist=0.25, random_state=2022)
z_umap = umap_obj.fit_transform(z)

[18]:

adata.obsm['X_z_umap'] = z_umap

[19]:

# Compute velocity graph and visualize it, together with inferred latent time
vv.model.velocity_graph(adata, key=key)
vv.velocity_embedding_stream(adata, key=key, basis='umap', color='leiden', title="", figsize=(8,6), legend_loc='right margin')
vv.velocity_embedding_stream(adata, key=key, basis='umap', color=f'{key}_time', color_map='gnuplot', title="", figsize=(8,6), legend_loc='right margin')

computing velocity graph (using 1/8 cores)

    finished (0:00:04) --> added
    'vae_velocity_norm_graph', sparse matrix with cosine correlations (adata.uns)
computing velocity embedding
    finished (0:00:00) --> added
    'vae_velocity_norm_umap', embedded velocity vectors (adata.obsm)

[20]:

# Show Z UMAP
vv.velocity_embedding_stream(adata, key=key, basis='z_umap', color='leiden', title="", figsize=(8,6), legend_loc='right margin')
vv.velocity_embedding_stream(adata, key=key, basis='z_umap', color=f'{key}_time', color_map='gnuplot', title="", figsize=(8,6), legend_loc='right margin')

computing velocity embedding
    finished (0:00:00) --> added
    'vae_velocity_norm_z_umap', embedded velocity vectors (adata.obsm)

[21]:

# Decouping and coupling factors on UMAPs
adata.layers[f'{key}_decoupling'] = adata.layers[f'{key}_kc'] - adata.layers[f'{key}_rho']
adata.layers[f'{key}_coupling'] = adata.layers[f'{key}_kc'] + adata.layers[f'{key}_rho'] - 1
scv.pl.scatter(adata, basis='umap', color=np.mean(adata[:, adata.var[f'{key}_velocity_genes']].layers[f'{key}_decoupling'], axis=1), legend_loc='right margin')
scv.pl.scatter(adata, basis='z_umap', color=np.mean(adata[:, adata.var[f'{key}_velocity_genes']].layers[f'{key}_decoupling'], axis=1), legend_loc='right margin')

[22]:

# Dynamic plots of modality level by latent time for genes of interest
vv.dynamic_plot(adata, adata_atac, gene_plot, color_by='leiden')

[23]:

# Dynamic plots of reconstructed values only
vv.dynamic_plot(adata, adata_atac, gene_plot, color_by='leiden', show_pred_only=True)

[24]:

# Phase portraits of recontructed unspliced vs spliced for genes of interest, colored by cell type, latent time, and transcription state
scv.pl.scatter(adata, basis=['Robo2', 'Satb2', 'Gria2', 'Grin2b'], x=f'{key}_shat', y=f'{key}_uhat', color='leiden', legend_loc='none', frameon=False, fontsize=20)
scv.pl.scatter(adata, basis=['Robo2', 'Satb2', 'Gria2', 'Grin2b'], x=f'{key}_shat', y=f'{key}_uhat', color=f'{key}_time', legend_loc='none', frameon=False, fontsize=20, color_map='gnuplot')
scv.pl.scatter(adata, basis=['Robo2', 'Satb2', 'Gria2', 'Grin2b'], x=f'{key}_shat', y=f'{key}_uhat', color=f'{key}_rho', legend_loc='none', frameon=False, fontsize=20, color_map='coolwarm')

[25]:

# Phase portraits of recontructed chromatin vs unspliced for genes of interest, colored by cell type, latent time, and transcription state
scv.pl.scatter(adata, basis=['Robo2', 'Satb2', 'Gria2', 'Grin2b'], x=f'{key}_uhat', y=f'{key}_chat', color='leiden', legend_loc='none', frameon=False, fontsize=20)
scv.pl.scatter(adata, basis=['Robo2', 'Satb2', 'Gria2', 'Grin2b'], x=f'{key}_uhat', y=f'{key}_chat', color=f'{key}_time', legend_loc='none', frameon=False, fontsize=20, color_map='gnuplot')
scv.pl.scatter(adata, basis=['Robo2', 'Satb2', 'Gria2', 'Grin2b'], x=f'{key}_uhat', y=f'{key}_chat', color=f'{key}_kc', legend_loc='none', frameon=False, fontsize=20, color_map='coolwarm')

Differential (de)coupling test

[26]:

import plotnine as p9
p9.options.dpi = 100
p9.options.figure_size = (5, 4)

[27]:

# Define comparison pairs
a_list = [['V-SVZ'],
          ['Upper Layer'],
          ['Upper Layer', 'Deeper Layer'],
          ['Deeper Layer']]
b_list = [['nIPC', 'Upper Layer'],
          ['V-SVZ', 'Deeper Layer'],
          ['V-SVZ'],
          ['V-SVZ', 'Upper Layer']]

[28]:

def differential_func(adata, adata_atac, a, b):
    df_dd = vv.model.differential_dynamics(adata, adata_atac, model=model, idx1=a, idx2=b, mode='change', test_decoupling=True, save_raw=True)

    df_dd["log10_pscore"] = np.log10(df_dd["p_decoupling_no_change"])
    df_dd["gene_type"] = "Other"
    df_dd.loc['Satb2', "gene_type"] = 'Satb2'
    df_dd.loc['Robo2', "gene_type"] = 'Robo2'
    df_dd.loc['Gria2', "gene_type"] = 'Gria2'
    df_dd['gene_name'] = df_dd.index
    df_dd.loc[~np.isin(df_dd['gene_name'], ['Satb2', 'Robo2', 'Gria2']), 'gene_name'] = np.nan

    p = (
        p9.ggplot(df_dd, p9.aes("log2_diff_decoupling", "-log10_pscore", color="gene_type"))
        + p9.geom_point(
            df_dd.query("gene_type == 'Other'"), size=0.5, alpha=0.5, shape='.'
        )  # Plot other genes with transparence
        + p9.geom_point(df_dd.query("gene_type != 'Other'"))
        + p9.labs(x="Mean log2 decoupling factor (M1 --> M2)", y="Significance score")
        + p9.geom_text(p9.aes(label="gene_name"), nudge_y=0.08, nudge_x=-0.05, na_rm=True, size=10)
        + p9.scale_color_manual(values=['black', 'tab:red', 'tab:red', 'tab:red'])
        + p9.ggtitle("Differential decoupling")
        + p9.theme(legend_position='none')
    )
    print(p)

    df_dd["log10_pscore"] = np.log10(df_dd["p_coupling_no_change"])
    df_dd["gene_type"] = "Other"
    df_dd.loc['Satb2', "gene_type"] = 'Satb2'
    df_dd.loc['Robo2', "gene_type"] = 'Robo2'
    df_dd.loc['Gria2', "gene_type"] = 'Gria2'
    df_dd['gene_name'] = df_dd.index
    df_dd.loc[~np.isin(df_dd['gene_name'], ['Satb2', 'Robo2', 'Gria2']), 'gene_name'] = np.nan

    p = (
        p9.ggplot(df_dd, p9.aes("log2_diff_coupling", "-log10_pscore", color="gene_type"))
        + p9.geom_point(
            df_dd.query("gene_type == 'Other'"), size=0.5, alpha=0.5, shape='.'
        )  # Plot other genes with transparence
        + p9.geom_point(df_dd.query("gene_type != 'Other'"))
        + p9.labs(x="Mean log2 coupling factor (repression --> induction)", y="Significance score")
        + p9.geom_text(p9.aes(label="gene_name"), nudge_y=0.08, nudge_x=-0.1, na_rm=True, size=10)
        + p9.scale_color_manual(values=['black', 'tab:red', 'tab:red', 'tab:red'])
        + p9.ggtitle("Differential coupling")
        + p9.theme(legend_position='none')
    )
    print(p)

[29]:

# Plot volcano plots for each comparison
for a, b in zip(a_list, b_list):
    print(f"Comparing {a} vs {b}")
    a = np.isin(adata.obs['leiden'], a)
    b = np.isin(adata.obs['leiden'], b)
    differential_func(adata, adata_atac, a, b)

Comparing ['V-SVZ'] vs ['nIPC', 'Upper Layer']


Comparing ['Upper Layer'] vs ['V-SVZ', 'Deeper Layer']


Comparing ['Upper Layer', 'Deeper Layer'] vs ['V-SVZ']


Comparing ['Deeper Layer'] vs ['V-SVZ', 'Upper Layer']

Distributions of inferred rate parameters

[30]:

fig, axs = plt.subplots(1, 6, figsize=(40, 5))
axs[0].hist(adata.var[f'{key}_alpha_c'], bins=100);
axs[1].hist(adata.var[f'{key}_alpha'], bins=100);
axs[2].hist(adata.var[f'{key}_beta'], bins=100);
axs[3].hist(adata.var[f'{key}_gamma'], bins=100);
axs[4].hist(adata.var[f'{key}_scaling_c'], bins=100);
axs[5].hist(adata.var[f'{key}_scaling_u'], bins=100);

Save the final result

[31]:

adata.write_h5ad(data_path_base+"/final.h5ad")

[ ]: