Integration of two multi-omic datasets with (de)coupling
[1]:
import numpy as np
import scanpy as sc
import scvelo as scv
import torch
import matplotlib.pyplot as plt
import umap
import os
import multivelovae as vv
from datetime import datetime
[2]:
torch.cuda.is_available()
[2]:
True
[3]:
now = datetime.now()
date = now.strftime("%m_%d_%Y")
Read in processed data and define places to store output
[4]:
adata = sc.read_h5ad('3423-MV-2-8489-MV-1_adata_postpro_concat.h5ad')
adata_atac = sc.read_h5ad('3423-MV-2-8489-MV-1_adata_atac_postpro_concat.h5ad')
[5]:
gene_plot = ['AZU1', 'HBD', 'HDC', 'LYZ', 'PF4', 'SPINK2']
np.isin(gene_plot, adata.var_names)
[5]:
array([ True, True, True, True, True, True])
[6]:
'PROM1' in adata.var_names
[6]:
True
[7]:
dataset = "3423-MV-2-8489-MV-1"
model_path_base = f"checkpoints/{dataset}"
figure_path_base = f"figures/{dataset}"
data_path_base = f"data/{dataset}"
[8]:
adata, adata_atac
[8]:
(AnnData object with n_obs × n_vars = 17667 × 892
obs: 'total_unspliced', 'total_spliced', 'log1p_total_unspliced', 'log1p_total_spliced', 'fraction_u', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_rb', 'log1p_total_counts_rb', 'pct_counts_rb', 'total_counts_hla', 'log1p_total_counts_hla', 'pct_counts_hla', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'outlier', 'initial_size_unspliced', 'initial_size_spliced', 'initial_size', 'n_counts', 'S_score', 'G2M_score', 'phase', 'CC_difference', 'total_unspliced2', 'total_spliced2', 'log1p_total_unspliced2', 'log1p_total_spliced2', 'fraction_u2', 'n_c', 'n_Mu', 'n_Ms', 'leiden', 'total_chromatin', 'total_unspliced3', 'total_spliced3', 'log1p_total_unspliced3', 'log1p_total_spliced3', 'fraction_u3', 'total_Mc', 'total_Mu', 'total_Ms', 'batch', 'n_Mc'
var: 'gene_ids', 'feature_types', 'mt', 'rb', 'hla', 'hb', 'HVG-Sample 1', 'n_cells-Sample 1', 'n_cells_by_counts-Sample 1', 'mean_counts-Sample 1', 'log1p_mean_counts-Sample 1', 'pct_dropout_by_counts-Sample 1', 'total_counts-Sample 1', 'log1p_total_counts-Sample 1', 'gene_count_corr-Sample 1', 'highly_variable-Sample 1', 'mean-Sample 1', 'std-Sample 1', 'HVG-Sample 2', 'n_cells-Sample 2', 'n_cells_by_counts-Sample 2', 'mean_counts-Sample 2', 'log1p_mean_counts-Sample 2', 'pct_dropout_by_counts-Sample 2', 'total_counts-Sample 2', 'log1p_total_counts-Sample 2', 'gene_count_corr-Sample 2', 'highly_variable-Sample 2', 'mean-Sample 2', 'std-Sample 2'
uns: 'batch_colors', 'leiden_colors', 'neighbors', 'umap'
obsm: 'X_pca', 'X_umap'
layers: 'Ms', 'Mu', 'spliced', 'spliced_raw', 'unspliced', 'unspliced_raw'
obsp: 'connectivities', 'distances',
AnnData object with n_obs × n_vars = 17667 × 892
obs: 'n_counts', 'batch'
layers: 'Mc', 'chromatin_raw')
[9]:
adata.obs['leiden'].cat.categories
[9]:
Index(['CMP', 'DC', 'Erythrocyte', 'GMP', 'Granulocyte', 'HSC', 'LMPP', 'MEP',
'Mast Cells', 'Megakaryocyte', 'Platelet', 'Prog DC', 'Prog MK'],
dtype='object')
[10]:
adata.obs['batch'].cat.categories
[10]:
Index(['Sample 1', 'Sample 2'], dtype='object')
[11]:
os.makedirs(figure_path_base, exist_ok=True)
scv.pl.scatter(adata, basis='umap', color='leiden', legend_loc='right margin')
scv.pl.scatter(adata, basis='umap', color='batch', legend_loc='right margin')
[12]:
figure_path = figure_path_base+'/'+date
model_path = model_path_base+'/'+date
data_path = data_path_base
Initialize and train a MultiVeloVAE
[13]:
key = "vae"
[14]:
torch.manual_seed(2022)
np.random.seed(2022)
model = vv.VAEChrom(adata,
adata_atac,
batch_key='batch',
ref_batch=1,
batch_hvg_key='highly_variable',
var_to_regress=['total_unspliced3'],
device='cuda:0',
plot_init=False,
gene_plot=gene_plot,
cluster_key="leiden",
figure_path=figure_path,
embed="umap")
model.train(plot=False,
gene_plot=gene_plot,
figure_path=figure_path,
embed="umap")
model.save_model(model_path)
model.save_anndata(data_path, file_name="out.h5ad")
CVAE enabled. Performing batch effect correction.
Reference batch set to 1 (Sample 2).
Latent dimension set to 8.
KL weights set to 2.00 2.00.
Found ['total_unspliced3'] in obs. Regressing them out.
Batch 0 sparsity: 0.118
Batch 1 sparsity: 0.139
Learning rate set to 7.4e-4 based on data sparsity.
Early stop threshold set to 1.5.
Using Gaussian Prior.
Initializing using the steady-state and dynamical models.
859 out of 892 = 96.3% genes have good ellipse fits.
KS-test result: [1. 1. 1. 1. 1. 1. 1.]
Assigning cluster 6 to repressive.
Initial induction: 690, repression: 202 out of 892.
Computing scaling factors for each batch class.
-------------------------- Train a MultiVeloVAE -------------------------
********* Creating Training and Validation Datasets *********
Total Number of Iterations Per Epoch: 49, test iteration: 96
********* Finished. *********
********* Stage 1 *********
Epoch 1: Train ELBO = 122.456, Test ELBO = -58430.989 Total Time = 0 h : 0 m : 0 s
Epoch 50: Train ELBO = 1501.256, Test ELBO = 1497.321 Total Time = 0 h : 0 m : 16 s
********* Stage 1: Early Stop Triggered at epoch 83. *********
********* Retrieving best model from iteration 3745. *********
********* Stage 2 *********
Cell-wise KNN estimation.
Using 618 latent neighbors to select ancestors.
Percentage of Invalid Sets: 0.000
Average Set Size: 80
Finished. Actual Time: 0 h : 0 m : 15 s
********* Velocity Refinement Round 1 *********
Epoch 92: Train ELBO = 1502.394, Test ELBO = 1498.219 Total Time = 0 h : 0 m : 44 s
********* Round 1: Early Stop Triggered at epoch 92. *********
********* Retrieving best model from iteration 4450. *********
Cell-wise KNN estimation.
Finished. Actual Time: 0 h : 0 m : 2 s
********* Velocity Refinement Round 2 *********
Epoch 96: Train ELBO = 1479.619, Test ELBO = 1475.280 Total Time = 0 h : 0 m : 48 s
********* Round 2: Early Stop Triggered at epoch 96. *********
********* Retrieving best model from iteration 4642. *********
Change in x0: 0.181
Cell-wise KNN estimation.
Finished. Actual Time: 0 h : 0 m : 2 s
********* Velocity Refinement Round 3 *********
Epoch 100: Train ELBO = 1454.875, Test ELBO = 1451.734 Total Time = 0 h : 0 m : 52 s
********* Round 3: Early Stop Triggered at epoch 100. *********
********* Retrieving best model from iteration 4834. *********
Change in x0: 0.140
Cell-wise KNN estimation.
Finished. Actual Time: 0 h : 0 m : 2 s
********* Velocity Refinement Round 4 *********
Epoch 104: Train ELBO = 1436.807, Test ELBO = 1434.346 Total Time = 0 h : 0 m : 55 s
********* Round 4: Early Stop Triggered at epoch 104. *********
********* Retrieving best model from iteration 5026. *********
Change in x0: 0.116
Cell-wise KNN estimation.
Finished. Actual Time: 0 h : 0 m : 2 s
********* Velocity Refinement Round 5 *********
Epoch 108: Train ELBO = 1425.287, Test ELBO = 1423.135 Total Time = 0 h : 0 m : 59 s
********* Round 5: Early Stop Triggered at epoch 108. *********
********* Retrieving best model from iteration 5218. *********
Change in x0: 0.097
********* Stage 2: Early Stop Triggered at round 5. *********
Final: Train ELBO = 1425.287, Test ELBO = 1423.135
********* Finished. Total Time = 0 h : 1 m : 0 s *********
Using reference batch for latent variable computation.
Computing velocity.
Selected 726 velocity genes.
Writing anndata output to file.
Downstream analyses
[15]:
z = adata.obsm[f'{key}_z']
t = adata.obs[f'{key}_time'].to_numpy()
rho = adata.layers[f'{key}_rho']
kc = adata.layers[f'{key}_kc']
[16]:
# UMAP embedding from latent cell space using reference label
umap_obj = umap.UMAP(n_neighbors=30, n_components=2, min_dist=0.25, random_state=2022)
z_umap = umap_obj.fit_transform(z)
# UMAP embedding from latent cell space using original batch labels
z_batch = adata.obsm[f'{key}_z_batch']
umap_obj = umap.UMAP(n_neighbors=30, n_components=2, min_dist=0.25, random_state=2022)
z_umap2 = umap_obj.fit_transform(z_batch)
[17]:
scv.pl.scatter(adata, basis='umap', color=['leiden', 'batch', f'{key}_time'], legend_loc='right', color_map='gnuplot', ncols=1, size=30)
adata.obsm['X_z_umap'] = z_umap
scv.pl.scatter(adata, basis='z_umap', color=['leiden', 'batch', f'{key}_time'], legend_loc='right', color_map='gnuplot', ncols=1, size=30, save=f'{figure_path}/{dataset}_z_batch_time_{key}.png')
adata.obsm['X_z_umap2'] = z_umap2
scv.pl.scatter(adata, basis='z_umap2', color=['leiden', 'batch', f'{key}_time'], legend_loc='right', color_map='gnuplot', ncols=1, size=30, save=f'{figure_path}/{dataset}_z_batch_time2_{key}.png')
saving figure to file figures/3423-MV-2-8489-MV-1/06_04_2025/3423-MV-2-8489-MV-1_z_batch_time_vae.png
saving figure to file figures/3423-MV-2-8489-MV-1/06_04_2025/3423-MV-2-8489-MV-1_z_batch_time2_vae.png
[18]:
adata.obsm[f'{key}_latent'] = np.concatenate((adata.obsm[f'{key}_z'], adata.obsm[f'{key}_t']), axis=1)
sc.pp.neighbors(adata, use_rep=f'{key}_latent', n_neighbors=50)
[19]:
# Compute velocity graph on Z UMAP and visualize it, together with batch labels and inferred latent time
adata.obsm['X_z_umap'] = z_umap
vv.model.velocity_graph(adata, key=key, batch_corrected=True)
vv.velocity_embedding_stream(adata, key=key, basis='z_umap', color='leiden', title="", figsize=(8,6), legend_loc='right margin')
vv.velocity_embedding_stream(adata, key=key, basis='z_umap', color='batch', title="", figsize=(8,6), legend_loc='right margin')
vv.velocity_embedding_stream(adata, key=key, basis='z_umap', color=f'{key}_time', color_map='gnuplot', title="", figsize=(8,6), legend_loc='right margin')
computing velocity graph (using 1/8 cores)
finished (0:00:49) --> added
'vae_velocity_norm_graph', sparse matrix with cosine correlations (adata.uns)
computing velocity embedding
finished (0:00:02) --> added
'vae_velocity_norm_z_umap', embedded velocity vectors (adata.obsm)
[20]:
# HSC marker gene CD133 (PROM1)
scv.pl.scatter(adata, color='PROM1', basis='z_umap', title='CD133')
[21]:
sc.pl.scatter(adata, basis='z_umap', layers=f'{key}_shat', color=['ANK1', 'CPA3', 'MPO', 'SULF2', 'VWF'])
[22]:
# Phase portraits of reconstructed unspliced and spliced levels
scv.pl.scatter(adata, basis=['ANK1', 'CPA3', 'MPO', 'SULF2', 'VWF'], color='batch', ncols=5, frameon=False, size=20, fontsize=20)
scv.pl.scatter(adata, basis=['ANK1', 'CPA3', 'MPO', 'SULF2', 'VWF'], x=f'{key}_shat', y=f'{key}_uhat', color='batch', ncols=5, frameon=False, size=20, fontsize=20)
scv.pl.scatter(adata, basis=['ANK1', 'CPA3', 'MPO', 'SULF2', 'VWF'], x=f'{key}_shat', y=f'{key}_uhat', color='leiden', ncols=5, frameon=False, size=20, fontsize=20)
[23]:
# Dynamic plots of modality level by latent time for genes of interest
for gene in ['ANK1', 'CPA3', 'MPO', 'SULF2', 'VWF']:
scv.pl.scatter(adata, x=adata.obs[f'{key}_time'], y=adata[:, gene].layers[f'{key}_shat'], color='leiden', figsize=(6,4), title=f'{gene}', legend_loc='none', frameon=False, fontsize=20)
Decoupling and coupling factors of cells
[24]:
import matplotlib.colors as mcolors
[25]:
adata.layers[f'{key}_decoupling'] = adata.layers[f'{key}_kc'] - adata.layers[f'{key}_rho']
adata.layers[f'{key}_coupling'] = adata.layers[f'{key}_kc'] + adata.layers[f'{key}_rho'] - 1
[26]:
# Decouping factors of each modality on dynamic plots
scv.pl.scatter(adata, basis=['AZU1', 'HBD', 'HDC', 'LYZ', 'PF4', 'SPINK2'], x=f'{key}_time', y=f'{key}_chat', color=f'{key}_decoupling', dpi=300, legend_loc='none', frameon=False, fontsize=30, color_map='coolwarm', size=40,
norm=mcolors.CenteredNorm(halfrange=1), colorbar=False, wspace=0.1)
scv.pl.scatter(adata, basis=['AZU1', 'HBD', 'HDC', 'LYZ', 'PF4', 'SPINK2'], x=f'{key}_time', y=f'{key}_uhat', color=f'{key}_decoupling', dpi=300, legend_loc='none', frameon=False, fontsize=30, color_map='coolwarm', size=40,
norm=mcolors.CenteredNorm(halfrange=1), colorbar=False, wspace=0.1)
scv.pl.scatter(adata, basis=['AZU1', 'HBD', 'HDC', 'LYZ', 'PF4', 'SPINK2'], x=f'{key}_time', y=f'{key}_shat', color=f'{key}_decoupling', dpi=300, legend_loc='none', frameon=False, fontsize=30, color_map='coolwarm', size=40,
norm=mcolors.CenteredNorm(halfrange=1), colorbar=False, wspace=0.1)
[27]:
# Couping factors of each modality on dynamic plots
scv.pl.scatter(adata, basis=['AZU1', 'HBD', 'HDC', 'LYZ', 'PF4', 'SPINK2'], x=f'{key}_time', y=f'{key}_chat', color=f'{key}_coupling', dpi=300, legend_loc='none', frameon=False, fontsize=30, color_map='coolwarm', size=40,
norm=mcolors.CenteredNorm(halfrange=1), colorbar=False, wspace=0.1)
scv.pl.scatter(adata, basis=['AZU1', 'HBD', 'HDC', 'LYZ', 'PF4', 'SPINK2'], x=f'{key}_time', y=f'{key}_uhat', color=f'{key}_coupling', dpi=300, legend_loc='none', frameon=False, fontsize=30, color_map='coolwarm', size=40,
norm=mcolors.CenteredNorm(halfrange=1), colorbar=False, wspace=0.1)
scv.pl.scatter(adata, basis=['AZU1', 'HBD', 'HDC', 'LYZ', 'PF4', 'SPINK2'], x=f'{key}_time', y=f'{key}_shat', color=f'{key}_coupling', dpi=300, legend_loc='none', frameon=False, fontsize=30, color_map='coolwarm', size=40,
norm=mcolors.CenteredNorm(halfrange=1), colorbar=False, wspace=0.1)
Plot phase portraits of genes with the highest likelihoods
[28]:
top_genes = adata[:, adata.var[f'{key}_velocity_genes']].var[f'{key}_likelihood'].sort_values(ascending=False).index
top_genes = top_genes[~np.isin(top_genes, ['ANK1', 'CPA3', 'MPO', 'SULF2', 'VWF'])]
[29]:
scv.pl.scatter(adata, basis=top_genes[:20], color='leiden', ncols=5, frameon=False, fontsize=20, size=30)
scv.pl.scatter(adata, basis=top_genes[:20], x=f'{key}_shat', y=f'{key}_uhat', color='leiden', ncols=5, frameon=False, fontsize=20, size=30)
[30]:
## Phase portraits compare before and after integration
scv.pl.scatter(adata, basis=top_genes[:20], color='batch', ncols=5, frameon=False, fontsize=20, size=30)
scv.pl.scatter(adata, basis=top_genes[:20], x=f'{key}_shat', y=f'{key}_uhat', color='batch', ncols=5, frameon=False, fontsize=20, size=30)
[31]:
adata.layers['Mc'] = adata_atac.layers['Mc'].copy()
[32]:
scv.pl.scatter(adata, basis=top_genes[:20], x='Mu', y='Mc', color='leiden', ncols=5, frameon=False, fontsize=20, size=30)
scv.pl.scatter(adata, basis=top_genes[:20], x=f'{key}_uhat', y=f'{key}_chat', color='leiden', ncols=5, frameon=False, fontsize=20, size=30)
[33]:
scv.pl.scatter(adata, basis=top_genes[:20], x='Mu', y='Mc', color='batch', ncols=5, frameon=False, fontsize=20, size=30)
scv.pl.scatter(adata, basis=top_genes[:20], x=f'{key}_uhat', y=f'{key}_chat', color='batch', ncols=5, frameon=False, fontsize=20, size=30)
[34]:
# Dynamic plots of several marker genes colored by batch labels
vv.dynamic_plot(adata, adata_atac, gene_plot, color_by='batch', batch_correction=True)
Distributions of inferred rate parameters
[35]:
fig, axs = plt.subplots(2, 2, figsize=(15, 12))
axs[0, 0].hist(adata.var[f'{key}_alpha_c_0'], bins=100, alpha=0.6);
axs[0, 0].hist(adata.var[f'{key}_alpha_c_1'], bins=100, alpha=0.6);
axs[0, 0].set_title('alpha_c');
a1 = adata.var[f'{key}_alpha_0'].values
a1 = a1[a1 < np.percentile(a1, 99.5)]
a2 = adata.var[f'{key}_alpha_1'].values
a2 = a2[a2 < np.percentile(a2, 99.5)]
axs[0, 1].hist(a1, bins=100, alpha=0.6);
axs[0, 1].hist(a2, bins=100, alpha=0.6);
axs[0, 1].set_title('alpha');
axs[1, 0].hist(adata.var[f'{key}_beta_0'], bins=100, alpha=0.6);
axs[1, 0].hist(adata.var[f'{key}_beta_1'], bins=100, alpha=0.6);
axs[1, 0].set_title('beta');
axs[1, 1].hist(adata.var[f'{key}_gamma_0'], bins=100, alpha=0.6);
axs[1, 1].hist(adata.var[f'{key}_gamma_1'], bins=100, alpha=0.6);
axs[1, 1].set_title('gamma');
Save the final result
[36]:
adata.write_h5ad(data_path_base+"/final.h5ad")
[ ]: