Integration of two multi-omic datasets with differential test
[1]:
import numpy as np
import scanpy as sc
import scvelo as scv
import torch
import matplotlib.pyplot as plt
import umap
import os
import multivelovae as vv
from datetime import datetime
import seaborn as sns
import pandas as pd
import plotnine as p9
[2]:
torch.cuda.is_available()
[2]:
True
[3]:
now = datetime.now()
date = now.strftime("%m_%d_%Y")
Read in processed data and define places to store output
[4]:
adata = sc.read_h5ad('8489-MV-1-9060-MV-3_adata_postpro_concat.h5ad')
adata_atac = sc.read_h5ad('8489-MV-1-9060-MV-3_adata_atac_postpro_concat.h5ad')
[5]:
gene_plot = ['AZU1', 'HBD', 'HDC', 'LYZ', 'PF4', 'SPINK2', 'CTSS', 'SPP1']
np.isin(gene_plot, adata.var_names)
[5]:
array([ True, True, True, True, True, True, True, True])
[6]:
'PROM1' in adata.var_names
[6]:
True
[7]:
dataset = "8489-MV-1-9060-MV-3"
model_path_base = f"checkpoints/{dataset}"
figure_path_base = f"figures/{dataset}"
data_path_base = f"data/{dataset}"
[8]:
adata, adata_atac
[8]:
(AnnData object with n_obs × n_vars = 9908 × 929
obs: 'total_unspliced', 'total_spliced', 'log1p_total_unspliced', 'log1p_total_spliced', 'fraction_u', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_rb', 'log1p_total_counts_rb', 'pct_counts_rb', 'total_counts_hla', 'log1p_total_counts_hla', 'pct_counts_hla', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'outlier', 'initial_size_unspliced', 'initial_size_spliced', 'initial_size', 'n_counts', 'S_score', 'G2M_score', 'phase', 'CC_difference', 'total_unspliced2', 'total_spliced2', 'log1p_total_unspliced2', 'log1p_total_spliced2', 'fraction_u2', 'n_c', 'n_Mu', 'n_Ms', 'leiden', 'total_chromatin', 'total_unspliced3', 'total_spliced3', 'log1p_total_unspliced3', 'log1p_total_spliced3', 'fraction_u3', 'total_Mc', 'total_Mu', 'total_Ms', 'batch', 'n_Mc'
var: 'gene_ids', 'feature_types', 'mt', 'rb', 'hla', 'hb', 'HVG-Day 14', 'n_cells-Day 14', 'n_cells_by_counts-Day 14', 'mean_counts-Day 14', 'log1p_mean_counts-Day 14', 'pct_dropout_by_counts-Day 14', 'total_counts-Day 14', 'log1p_total_counts-Day 14', 'gene_count_corr-Day 14', 'highly_variable-Day 14', 'mean-Day 14', 'std-Day 14', 'HVG-Day 7', 'n_cells-Day 7', 'n_cells_by_counts-Day 7', 'mean_counts-Day 7', 'log1p_mean_counts-Day 7', 'pct_dropout_by_counts-Day 7', 'total_counts-Day 7', 'log1p_total_counts-Day 7', 'gene_count_corr-Day 7', 'highly_variable-Day 7', 'mean-Day 7', 'std-Day 7'
uns: 'batch_colors', 'leiden_colors', 'neighbors', 'umap'
obsm: 'X_pca', 'X_umap'
layers: 'Ms', 'Mu', 'counts', 'spliced', 'unspliced'
obsp: 'connectivities', 'distances',
AnnData object with n_obs × n_vars = 9908 × 929
obs: 'n_counts', 'batch'
layers: 'Mc')
[9]:
adata.obs['leiden'].cat.categories
[9]:
Index(['CMP', 'DC', 'Erythrocyte', 'GMP', 'Granulocyte', 'HSC', 'LMPP',
'M1 Macrophage', 'M2 Macrophage', 'MDP', 'MEP', 'Mast Cells',
'Megakaryocyte', 'Monocyte', 'Platelet', 'Prog DC', 'Prog MK'],
dtype='object')
[10]:
adata.obs['batch'].cat.categories
[10]:
Index(['Day 7', 'Day 14'], dtype='object')
[11]:
os.makedirs(figure_path_base, exist_ok=True)
scv.pl.scatter(adata, basis='umap', color='leiden', legend_loc='right margin')
scv.pl.scatter(adata, basis='umap', color='batch', legend_loc='right margin')
[12]:
figure_path = figure_path_base+'/'+date
model_path = model_path_base+'/'+date
data_path = data_path_base
Initialize and train a MultiVeloVAE
[13]:
key = "vae"
[14]:
torch.manual_seed(2022)
np.random.seed(2022)
model = vv.VAEChrom(adata,
adata_atac,
batch_key='batch',
ref_batch=0,
batch_hvg_key='highly_variable',
var_to_regress=['total_unspliced3'],
device='cuda:0',
plot_init=False,
gene_plot=gene_plot,
cluster_key="leiden",
figure_path=figure_path,
embed="umap")
model.train(plot=False,
gene_plot=gene_plot,
figure_path=figure_path,
embed="umap")
model.save_model(model_path)
model.save_anndata(data_path, file_name="out.h5ad")
CVAE enabled. Performing batch effect correction.
Reference batch set to 0 (Day 7).
Latent dimension set to 8.
KL weights set to 2.00 2.00.
Found ['total_unspliced3'] in obs. Regressing them out.
Batch 0 sparsity: 0.122
Batch 1 sparsity: 0.106
Learning rate set to 7.7e-4 based on data sparsity.
Early stop threshold set to 1.6.
Using Gaussian Prior.
Initializing using the steady-state and dynamical models.
849 out of 929 = 91.4% genes have good ellipse fits.
KS-test result: [1. 1. 1. 1. 1. 1. 1.]
Assigning cluster 2 to repressive.
Initial induction: 714, repression: 215 out of 929.
Computing scaling factors for each batch class.
-------------------------- Train a MultiVeloVAE -------------------------
********* Creating Training and Validation Datasets *********
Total Number of Iterations Per Epoch: 28, test iteration: 54
********* Finished. *********
********* Stage 1 *********
Epoch 1: Train ELBO = -2688.083, Test ELBO = -49984.745 Total Time = 0 h : 0 m : 0 s
Epoch 50: Train ELBO = 2096.158, Test ELBO = 2080.189 Total Time = 0 h : 0 m : 9 s
Epoch 100: Train ELBO = 2157.434, Test ELBO = 2143.381 Total Time = 0 h : 0 m : 18 s
Epoch 150: Train ELBO = 2176.633, Test ELBO = 2162.807 Total Time = 0 h : 0 m : 27 s
********* Stage 1: Early Stop Triggered at epoch 199. *********
********* Retrieving best model from iteration 5455. *********
********* Stage 2 *********
Cell-wise KNN estimation.
Using 346 latent neighbors to select ancestors.
Percentage of Invalid Sets: 0.002
Average Set Size: 44
Finished. Actual Time: 0 h : 0 m : 7 s
********* Velocity Refinement Round 1 *********
Epoch 210: Train ELBO = 2116.457, Test ELBO = 2103.332 Total Time = 0 h : 0 m : 46 s
********* Round 1: Early Stop Triggered at epoch 210. *********
********* Retrieving best model from iteration 5852. *********
Cell-wise KNN estimation.
Finished. Actual Time: 0 h : 0 m : 1 s
********* Velocity Refinement Round 2 *********
Epoch 216: Train ELBO = 2058.629, Test ELBO = 2046.996 Total Time = 0 h : 0 m : 48 s
********* Round 2: Early Stop Triggered at epoch 216. *********
********* Retrieving best model from iteration 6014. *********
Change in x0: 0.194
Cell-wise KNN estimation.
Finished. Actual Time: 0 h : 0 m : 1 s
********* Velocity Refinement Round 3 *********
Epoch 220: Train ELBO = 1996.865, Test ELBO = 1984.590 Total Time = 0 h : 0 m : 50 s
********* Round 3: Early Stop Triggered at epoch 220. *********
********* Retrieving best model from iteration 6122. *********
Change in x0: 0.158
Cell-wise KNN estimation.
Finished. Actual Time: 0 h : 0 m : 1 s
********* Velocity Refinement Round 4 *********
Epoch 224: Train ELBO = 1942.320, Test ELBO = 1931.397 Total Time = 0 h : 0 m : 53 s
********* Round 4: Early Stop Triggered at epoch 224. *********
********* Retrieving best model from iteration 6230. *********
Change in x0: 0.135
Cell-wise KNN estimation.
Finished. Actual Time: 0 h : 0 m : 1 s
********* Velocity Refinement Round 5 *********
Epoch 228: Train ELBO = 1901.485, Test ELBO = 1891.341 Total Time = 0 h : 0 m : 55 s
********* Round 5: Early Stop Triggered at epoch 228. *********
********* Retrieving best model from iteration 6338. *********
Change in x0: 0.118
Cell-wise KNN estimation.
Finished. Actual Time: 0 h : 0 m : 1 s
********* Velocity Refinement Round 6 *********
Epoch 232: Train ELBO = 1876.911, Test ELBO = 1865.643 Total Time = 0 h : 0 m : 57 s
********* Round 6: Early Stop Triggered at epoch 232. *********
********* Retrieving best model from iteration 6446. *********
Change in x0: 0.102
Final: Train ELBO = 1876.911, Test ELBO = 1865.643
********* Finished. Total Time = 0 h : 0 m : 57 s *********
Using reference batch for latent variable computation.
Computing velocity.
Selected 643 velocity genes.
Writing anndata output to file.
Downstream analyses
[15]:
z = adata.obsm[f'{key}_z']
t = adata.obs[f'{key}_time'].to_numpy()
rho = adata.layers[f'{key}_rho']
kc = adata.layers[f'{key}_kc']
[16]:
# UMAP embedding from latent cell space using reference label
umap_obj = umap.UMAP(n_neighbors=30, n_components=2, min_dist=0.25, random_state=2022)
z_umap = umap_obj.fit_transform(z)
# UMAP embedding from latent cell space using original batch labels
z_batch = adata.obsm[f'{key}_z_batch']
umap_obj = umap.UMAP(n_neighbors=30, n_components=2, min_dist=0.25, random_state=2022)
z_umap2 = umap_obj.fit_transform(z_batch)
[17]:
scv.pl.scatter(adata, basis='umap', color=['leiden', 'batch', f'{key}_time'], legend_loc='right', color_map='gnuplot', ncols=1, size=30)
adata.obsm['X_z_umap'] = z_umap
scv.pl.scatter(adata, basis='z_umap', color=['leiden', 'batch', f'{key}_time'], legend_loc='right', color_map='gnuplot', ncols=1, size=30, save=f'{figure_path}/{dataset}_z_batch_time_{key}.png')
adata.obsm['X_z_umap2'] = z_umap2
scv.pl.scatter(adata, basis='z_umap2', color=['leiden', 'batch', f'{key}_time'], legend_loc='right', color_map='gnuplot', ncols=1, size=30, save=f'{figure_path}/{dataset}_z_batch_time2_{key}.png')
saving figure to file figures/8489-MV-1-9060-MV-3/06_04_2025/8489-MV-1-9060-MV-3_z_batch_time_vae.png
saving figure to file figures/8489-MV-1-9060-MV-3/06_04_2025/8489-MV-1-9060-MV-3_z_batch_time2_vae.png
[18]:
adata.obsm[f'{key}_latent'] = np.concatenate((adata.obsm[f'{key}_z'], adata.obsm[f'{key}_t']), axis=1)
sc.pp.neighbors(adata, use_rep=f'{key}_latent', n_neighbors=50)
[19]:
# Compute velocity graph on Z UMAP and visualize it, together with batch labels and inferred latent time
adata.obsm['X_z_umap'] = z_umap
vv.model.velocity_graph(adata, key=key, batch_corrected=True)
vv.velocity_embedding_stream(adata, key=key, basis='z_umap', color='leiden', title="", figsize=(8,6), legend_loc='right margin')
vv.velocity_embedding_stream(adata, key=key, basis='z_umap', color='batch', title="", figsize=(8,6), legend_loc='right margin')
vv.velocity_embedding_stream(adata, key=key, basis='z_umap', color=f'{key}_time', color_map='gnuplot', title="", figsize=(8,6), legend_loc='right margin')
computing velocity graph (using 1/8 cores)
finished (0:00:15) --> added
'vae_velocity_norm_graph', sparse matrix with cosine correlations (adata.uns)
computing velocity embedding
finished (0:00:00) --> added
'vae_velocity_norm_z_umap', embedded velocity vectors (adata.obsm)
[20]:
# HSC marker gene CD133 (PROM1)
scv.pl.scatter(adata, color='PROM1', basis='z_umap', title='CD133')
Plot phase portraits of genes with the highest likelihoods
[21]:
top_genes = adata[:, adata.var[f'{key}_velocity_genes']].var[f'{key}_likelihood'].sort_values(ascending=False).index
[22]:
scv.pl.scatter(adata, basis=top_genes[:20], color='leiden', ncols=5, frameon=False, fontsize=20, size=30)
scv.pl.scatter(adata, basis=top_genes[:20], x=f'{key}_shat', y=f'{key}_uhat', color='leiden', ncols=5, frameon=False, fontsize=20, size=30)
[23]:
scv.pl.scatter(adata, basis=top_genes[:20], color='batch', ncols=5, frameon=False, fontsize=20, size=30)
scv.pl.scatter(adata, basis=top_genes[:20], x=f'{key}_shat', y=f'{key}_uhat', color='batch', ncols=5, frameon=False, fontsize=20, size=30)
Plot phase portraits of selected marker genes
[24]:
scv.pl.scatter(adata, basis=gene_plot, color='leiden', ncols=5, frameon=False, fontsize=20, size=30)
scv.pl.scatter(adata, basis=gene_plot, x=f'{key}_shat', y=f'{key}_uhat', color='leiden', ncols=5, frameon=False, fontsize=20, size=30)
[25]:
scv.pl.scatter(adata, basis=gene_plot, color='batch', ncols=5, frameon=False, fontsize=20, size=30)
scv.pl.scatter(adata, basis=gene_plot, x=f'{key}_shat', y=f'{key}_uhat', color='batch', ncols=5, frameon=False, fontsize=20, size=30)
[26]:
# Dynamic plots of several marker genes
vv.dynamic_plot(adata, adata_atac, gene_plot, color_by='leiden', batch_correction=True)
[27]:
vv.dynamic_plot(adata, adata_atac, gene_plot, color_by='batch', batch_correction=True)
[28]:
# Latent time distribution of two datasets
sns.violinplot(y="batch", x=f'{key}_time', data=adata.obs, orient='h');
Distributions of inferred rate parameters
[29]:
fig, axs = plt.subplots(1, 4, figsize=(30, 5))
axs[0].hist(adata.var[f'{key}_alpha_c_0'], bins=100, alpha=0.6);
axs[0].hist(adata.var[f'{key}_alpha_c_1'], bins=100, alpha=0.6);
axs[0].set_title('alpha_c');
a1 = adata.var[f'{key}_alpha_0'].values
a1 = a1[a1 < np.percentile(a1, 99.5)]
a2 = adata.var[f'{key}_alpha_1'].values
a2 = a2[a2 < np.percentile(a2, 99.5)]
axs[1].hist(a1, bins=100, alpha=0.6);
axs[1].hist(a2, bins=100, alpha=0.6);
axs[1].set_title('alpha');
axs[2].hist(adata.var[f'{key}_beta_0'], bins=100, alpha=0.6);
axs[2].hist(adata.var[f'{key}_beta_1'], bins=100, alpha=0.6);
axs[2].set_title('beta');
axs[3].hist(adata.var[f'{key}_gamma_0'], bins=100, alpha=0.6);
axs[3].hist(adata.var[f'{key}_gamma_1'], bins=100, alpha=0.6);
axs[3].set_title('gamma');
[30]:
sns.set_theme(style="white", palette='tab10')
[31]:
adata_copy = adata.copy()
[32]:
adata_copy.obs['leiden'] = pd.Categorical(adata_copy.obs['leiden'], ['Mast Cells', 'Erythrocyte', 'Granulocyte', 'Platelet', 'Megakaryocyte', 'Prog MK', 'MEP', 'CMP', 'HSC', 'LMPP', 'GMP', 'MDP', 'Monocyte', 'Prog DC', 'DC', 'M1 Macrophage', 'M2 Macrophage'])
Distributions of cell types in two datasets
[33]:
fig, ax = plt.subplots(1, 1, figsize=(25, 6))
sns.histplot(
data=adata_copy.obs,
x="leiden",
hue="batch",
multiple="stack",
discrete=True,
shrink=.7
);
sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))
ax.tick_params(axis='x', rotation=35)
ax.set_xlabel('');
[34]:
adata_sub = adata[np.isin(adata.obs['leiden'], ['Erythrocyte', 'Granulocyte', 'Mast Cells', 'Platelet'])].copy()
[35]:
fig, ax = plt.subplots(1, 1, figsize=(15, 7))
sns.violinplot(data=adata_sub.obs, x="leiden", y=f"{key}_time", hue="batch", split=True, inner="quart", linewidth=1.5, ax=ax);
sns.move_legend(ax, "upper left")
ax.set_xlabel('');
ax.set_ylabel('Latent time');
[36]:
adata_sub = adata[np.isin(adata.obs['leiden'], ['Prog DC', 'Monocyte', 'DC', 'M1 Macrophage', 'M2 Macrophage'])].copy()
adata_sub.obs['leiden'] = pd.Categorical(adata_sub.obs['leiden'], ['Monocyte', 'M2 Macrophage', 'M1 Macrophage', 'Prog DC', 'DC'])
[37]:
colors = {x:y for x,y in zip(adata.obs['leiden'].cat.categories, adata.uns['leiden_colors'])}
[38]:
fig, ax = plt.subplots(1, 1, figsize=(15, 7))
sns.violinplot(data=adata_sub.obs, x="leiden", y=f"{key}_time", split=True, inner="quart", linewidth=1.5, palette=colors, ax=ax);
ax.set_xlabel('');
ax.set_ylabel('Latent time');
Differential dynamics
Compare Macrophage and DC
[39]:
adata_sub = adata[adata.obs['leiden'].isin(['DC', 'GMP', 'HSC', 'LMPP', 'MDP',
'M1 Macrophage', 'M2 Macrophage', 'Monocyte', 'Prog DC'])].copy()
adata_sub = adata_sub[adata_sub.obsm['X_z_umap'][:,0]<np.percentile(adata_sub.obsm['X_z_umap'][:,0], 99.5)].copy()
scv.pl.scatter(adata_sub, basis='z_umap', color='leiden', legend_loc='right', title='')
adata_atac_sub = adata_atac[adata_sub.obs_names,:]
[40]:
a = np.isin(adata.obs['leiden'], ['M1 Macrophage', 'M2 Macrophage'])
b = np.isin(adata.obs['leiden'], ['DC'])
[41]:
df_dd_ = vv.model.differential_dynamics(adata, adata_atac, model=model, batch_key='batch', idx1=a, idx2=b, mode='change')
Different batches found in group1 and group2. Sampling pairs regardless of batch conditions.
Using reference batch for latent variable computation.
Using reference batch for latent variable computation.
[42]:
df_dd_["log10_pscore"] = np.log10(df_dd_["p_v_no_change"])
df_dd_["gene_type"] = "Other"
df_dd_.loc[(df_dd_['log2_diff_v']>3).to_numpy() & (df_dd_['log10_pscore']<np.log10(0.05)).to_numpy(), "gene_type"] = 'LD>3&Pval<0.05'
df_dd_.loc[(df_dd_['log2_diff_v']<-3).to_numpy() & (df_dd_['log10_pscore']<np.log10(0.05)).to_numpy(), "gene_type"] = 'LD<-3&Pval<0.05'
df_dd_['gene_name'] = df_dd_.index
df_dd_.loc[~np.isin(df_dd_['gene_type'], ['LD>3&Pval<0.05', 'LD<-3&Pval<0.05']), 'gene_name'] = np.nan
[43]:
df_dd_.groupby(['gene_type']).size()
[43]:
gene_type
LD<-3&Pval<0.05 18
LD>3&Pval<0.05 43
Other 868
dtype: int64
[44]:
# Volcano plot of differential velocities
(
p9.ggplot(df_dd_, p9.aes("log2_diff_v", "-log10_pscore", color="gene_type"))
+ p9.geom_point(
df_dd_.query("gene_type == 'Other'"), alpha=0.5
) # Plot other genes with transparence
+ p9.geom_point(df_dd_.query("gene_type != 'Other'"))
+ p9.labs(x="Mean log difference", y="Significance score")
+ p9.theme_minimal()
)
[44]:
<ggplot: (8746974117973)>
[45]:
# Dynamic plots of C, U, and S of top differential velocity genes in macrophages
vv.dynamic_plot(adata_sub, adata_atac_sub, df_dd_.sort_values(by=['bayes_factor_v', 'log2_diff_v'], ascending=[False, False])[df_dd_['log2_diff_v']>0].index[:10], color_by='leiden', batch_correction=True)
[46]:
# Dynamic plots of VC, VU, and VS of top differential velocity genes in macrophages
vv.dynamic_plot(adata_sub, adata_atac_sub, df_dd_.sort_values(by=['bayes_factor_v', 'log2_diff_v'], ascending=[False, False])[df_dd_['log2_diff_v']>0].index[:10], color_by='leiden', by='velocity')
[47]:
import matplotlib.colors as mcolors
[48]:
adata_copy2 = adata_sub.copy()
adata_copy2.X = adata_copy2.layers[f'{key}_velocity'].copy()
adata_copy2.X = adata_copy2.X / np.max(np.abs(adata_copy2.X), axis=0)
[ ]:
# Velocity level scatter plots of top differential velocity genes in macrophages
scv.pl.scatter(adata_copy2, basis='z_umap', color=df_dd_.sort_values(by=['bayes_factor_v', 'log2_diff_v'], ascending=[False, False])[df_dd_['log2_diff_v']>0].index[:9], frameon=False, ncols=3, colorbar=False, wspace=0.1, hspace=0.1,
fontsize=30, color_map='coolwarm', norm=mcolors.CenteredNorm(halfrange=1))
[50]:
# Dynamic plots of C, U, and S of top differential velocity genes in DCs
vv.dynamic_plot(adata_sub, adata_atac_sub, df_dd_.sort_values(by=['bayes_factor_v', 'log2_diff_v'], ascending=[False, True])[df_dd_['log2_diff_v']<0].index[:10], color_by='leiden', batch_correction=True)
[51]:
# Dynamic plots of VC, VU, and VS of top differential velocity genes in DCs
vv.dynamic_plot(adata_sub, adata_atac_sub, df_dd_.sort_values(by=['bayes_factor_v', 'log2_diff_v'], ascending=[False, True])[df_dd_['log2_diff_v']<0].index[:10], color_by='leiden', by='velocity')
[ ]:
# Velocity level scatter plots of top differential velocity genes in DCs
scv.pl.scatter(adata_copy2, basis='z_umap', color=df_dd_.sort_values(by=['bayes_factor_v', 'log2_diff_v'], ascending=[False, True])[df_dd_['log2_diff_v']<0].index[:9], frameon=False, ncols=3, colorbar=False, wspace=0.1, hspace=0.1,
fontsize=30, color_map='coolwarm', norm=mcolors.CenteredNorm(halfrange=1))
Differential dynamics plots
[53]:
color_include = ['Monocyte', 'Prog DC', 'DC', 'M1 Macrophage', 'M2 Macrophage']
[54]:
df_dd_mac = vv.model.differential_dynamics(adata, adata_atac, model=model, batch_key='batch', idx1=a, idx2=b, mode='change', save_raw=True)
Different batches found in group1 and group2. Sampling pairs regardless of batch conditions.
Using reference batch for latent variable computation.
Using reference batch for latent variable computation.
[55]:
plt.hist(np.log(df_dd_mac['mean_c1']), bins=100, alpha=0.7);
plt.hist(np.log(df_dd_mac['mean_u1']), bins=100, alpha=0.7);
plt.hist(np.log(df_dd_mac['mean_s1']), bins=100, alpha=0.7);
[56]:
mac_ddg = df_dd_mac.sort_values(by='bayes_factor_v', ascending=False)[(df_dd_mac['log2_diff_v']>0)
& (np.log(df_dd_mac['mean_c1'])>-1.5)
& (np.log(df_dd_mac['mean_u1'])>-3)
& (np.log(df_dd_mac['mean_s1'])>-3)].index
len(mac_ddg)
[56]:
297
[57]:
mac_ddg
[57]:
Index(['RASGRP3', 'SLC37A1', 'GFOD1', 'KIFC3', 'JAKMIP2', 'ABCC3', 'ABCA1',
'ADAM10', 'MINDY2', 'SLC2A3',
...
'MINPP1', 'CDK6', 'MAN1A1', 'NUCB2', 'MYH10', 'PDZD8', 'CCDC171',
'TNFAIP8', 'NCOA4', 'CAT'],
dtype='object', length=297)
[58]:
gene = 'PROS1'
vv.differential_dynamics_plot(adata, gene, var='kc', color_by='leiden', color_include=color_include, title=True, legend=False, axis_on=False, frame_on=False)
vv.differential_dynamics_plot(adata, gene, var='rho', color_by='leiden', color_include=color_include, title=True, legend=False, axis_on=False, frame_on=False)
vv.differential_dynamics_plot(adata, gene, var='v', color_by='leiden', color_include=color_include, title=True, legend=False, axis_on=False, frame_on=False)
vv.differential_dynamics_plot(adata, gene, var='s', color_by='leiden', color_include=color_include, title=True, legend=False, axis_on=False, frame_on=False)
vv.dynamic_plot(adata_sub, adata_atac[adata_sub.obs_names,:], gene, color_by='leiden', by='velocity', axis_on=False, frame_on=False)
vv.dynamic_plot(adata_sub, adata_atac[adata_sub.obs_names,:], gene, color_by='leiden', batch_correction=True, axis_on=False, frame_on=False)
[59]:
gene = 'LGMN'
vv.differential_dynamics_plot(adata, gene, var='kc', color_by='leiden', color_include=color_include, title=True, legend=False, axis_on=False, frame_on=False)
vv.differential_dynamics_plot(adata, gene, var='rho', color_by='leiden', color_include=color_include, title=True, legend=False, axis_on=False, frame_on=False)
vv.differential_dynamics_plot(adata, gene, var='v', color_by='leiden', color_include=color_include, title=True, legend=False, axis_on=False, frame_on=False)
vv.differential_dynamics_plot(adata, gene, var='s', color_by='leiden', color_include=color_include, title=True, legend=False, axis_on=False, frame_on=False)
vv.dynamic_plot(adata_sub, adata_atac[adata_sub.obs_names,:], gene, color_by='leiden', by='velocity', axis_on=False, frame_on=False)
vv.dynamic_plot(adata_sub, adata_atac[adata_sub.obs_names,:], gene, color_by='leiden', batch_correction=True, axis_on=False, frame_on=False)
[60]:
gene = 'LGALS3'
vv.differential_dynamics_plot(adata, gene, var='kc', color_by='leiden', color_include=color_include, title=True, legend=False, axis_on=False, frame_on=False)
vv.differential_dynamics_plot(adata, gene, var='rho', color_by='leiden', color_include=color_include, title=True, legend=False, axis_on=False, frame_on=False)
vv.differential_dynamics_plot(adata, gene, var='v', color_by='leiden', color_include=color_include, title=True, legend=False, axis_on=False, frame_on=False)
vv.differential_dynamics_plot(adata, gene, var='s', color_by='leiden', color_include=color_include, title=True, legend=False, axis_on=False, frame_on=False)
vv.dynamic_plot(adata_sub, adata_atac[adata_sub.obs_names,:], gene, color_by='leiden', by='velocity', axis_on=False, frame_on=False)
vv.dynamic_plot(adata_sub, adata_atac[adata_sub.obs_names,:], gene, color_by='leiden', batch_correction=True, axis_on=False, frame_on=False)
[61]:
df_dd_dc = vv.model.differential_dynamics(adata, adata_atac, model=model, batch_key='batch', idx1=b, idx2=a, mode='change', save_raw=True)
Different batches found in group1 and group2. Sampling pairs regardless of batch conditions.
Using reference batch for latent variable computation.
Using reference batch for latent variable computation.
[62]:
plt.hist(np.log(df_dd_dc['mean_c1']), bins=100, alpha=0.7);
plt.hist(np.log(df_dd_dc['mean_u1']), bins=100, alpha=0.7);
plt.hist(np.log(df_dd_dc['mean_s1']), bins=100, alpha=0.7);
[63]:
dc_ddg = df_dd_dc.sort_values(by='bayes_factor_v', ascending=False)[(df_dd_dc['log2_diff_v']>0)
& (np.log(df_dd_dc['mean_c1'])>-1.5)
& (np.log(df_dd_dc['mean_u1'])>-3)
& (np.log(df_dd_dc['mean_s1'])>-3)].index
len(dc_ddg)
[63]:
307
[64]:
dc_ddg
[64]:
Index(['CACNB4', 'TCF4', 'CNR2', 'DAPP1', 'IRF8', 'OTULINL', 'BLNK', 'HERPUD1',
'HIP1', 'IPCEF1',
...
'LINC00662', 'LPXN', 'DIAPH1', 'AIG1', 'DEPTOR', 'LMO4', 'ATP9B',
'ZNF521', 'PRKAR2B', 'DDX10'],
dtype='object', length=307)
[65]:
gene = 'IRF8'
vv.differential_dynamics_plot(adata, gene, var='kc', color_by='leiden', color_include=color_include, title=True, legend=False, axis_on=False, frame_on=False)
vv.differential_dynamics_plot(adata, gene, var='rho', color_by='leiden', color_include=color_include, title=True, legend=False, axis_on=False, frame_on=False)
vv.differential_dynamics_plot(adata, gene, var='v', color_by='leiden', color_include=color_include, title=True, legend=False, axis_on=False, frame_on=False)
vv.differential_dynamics_plot(adata, gene, var='s', color_by='leiden', color_include=color_include, title=True, legend=False, axis_on=False, frame_on=False)
vv.dynamic_plot(adata_sub, adata_atac[adata_sub.obs_names,:], gene, color_by='leiden', by='velocity', axis_on=False, frame_on=False)
vv.dynamic_plot(adata_sub, adata_atac[adata_sub.obs_names,:], gene, color_by='leiden', batch_correction=True, axis_on=False, frame_on=False)
[66]:
gene = 'CSF2RB'
vv.differential_dynamics_plot(adata, gene, var='kc', color_by='leiden', color_include=color_include, title=True, legend=False, axis_on=False, frame_on=False)
vv.differential_dynamics_plot(adata, gene, var='rho', color_by='leiden', color_include=color_include, title=True, legend=False, axis_on=False, frame_on=False)
vv.differential_dynamics_plot(adata, gene, var='v', color_by='leiden', color_include=color_include, title=True, legend=False, axis_on=False, frame_on=False)
vv.differential_dynamics_plot(adata, gene, var='s', color_by='leiden', color_include=color_include, title=True, legend=False, axis_on=False, frame_on=False)
vv.dynamic_plot(adata_sub, adata_atac[adata_sub.obs_names,:], gene, color_by='leiden', by='velocity', axis_on=False, frame_on=False)
vv.dynamic_plot(adata_sub, adata_atac[adata_sub.obs_names,:], gene, color_by='leiden', batch_correction=True, axis_on=False, frame_on=False)
Compare Erythrocyte and Megakaryocyte
[67]:
adata_sub2 = adata[adata.obs['leiden'].isin(['CMP', 'Erythrocyte', 'Granulocyte', 'HSC', 'MEP',
'Mast Cells', 'Megakaryocyte', 'Platelet', 'Prog MK'])].copy()
adata_sub2 = adata_sub2[adata_sub2.obsm['X_z_umap'][:,0]>np.percentile(adata_sub2.obsm['X_z_umap'][:,0], 0.28)].copy()
scv.pl.scatter(adata_sub2, basis='z_umap', color='leiden', legend_loc='right', title='')
adata_atac_sub2 = adata_atac[adata_sub2.obs_names,:]
[68]:
a = np.isin(adata.obs['leiden'], ['Megakaryocyte', 'Platelet'])
b = np.isin(adata.obs['leiden'], ['Erythrocyte'])
[69]:
df_dd_ = vv.model.differential_dynamics(adata, adata_atac, model=model, batch_key='batch', idx1=a, idx2=b, mode='change')
Using reference batch for latent variable computation.
Using reference batch for latent variable computation.
[70]:
df_dd_["log10_pscore"] = np.log10(df_dd_["p_v_no_change"])
df_dd_["gene_type"] = "Other"
df_dd_.loc[(df_dd_['log2_diff_v']>4).to_numpy() & (df_dd_['log10_pscore']<np.log10(0.05)).to_numpy(), "gene_type"] = 'LD>4&Pval<0.05'
df_dd_.loc[(df_dd_['log2_diff_v']<-2).to_numpy() & (df_dd_['log10_pscore']<np.log10(0.05)).to_numpy(), "gene_type"] = 'LD<-2&Pval<0.05'
df_dd_['gene_name'] = df_dd_.index
df_dd_.loc[~np.isin(df_dd_['gene_type'], ['LD>4&Pval<0.05', 'LD<-2&Pval<0.05']), 'gene_name'] = np.nan
[71]:
df_dd_.groupby(['gene_type']).size()
[71]:
gene_type
LD<-2&Pval<0.05 27
LD>4&Pval<0.05 40
Other 862
dtype: int64
[72]:
(
p9.ggplot(df_dd_, p9.aes("log2_diff_v", "-log10_pscore", color="gene_type"))
+ p9.geom_point(
df_dd_.query("gene_type == 'Other'"), alpha=0.5
) # Plot other genes with transparence
+ p9.geom_point(df_dd_.query("gene_type != 'Other'"))
+ p9.labs(x="Mean log difference", y="Significance score")
+ p9.theme_minimal()
)
[72]:
<ggplot: (8746974510216)>
[73]:
adata_copy2 = adata_sub2.copy()
adata_copy2.X = adata_copy2.layers[f'{key}_velocity'].copy()
adata_copy2.X = adata_copy2.X / np.max(np.abs(adata_copy2.X), axis=0)
[ ]:
scv.pl.scatter(adata_copy2, basis='z_umap', color=df_dd_.sort_values(by=['bayes_factor_v', 'log2_diff_v'], ascending=[False, False])[df_dd_['log2_diff_v']>0].index[:9], frameon=False, ncols=3, colorbar=False, wspace=0.1, hspace=0.1,
fontsize=30, color_map='coolwarm', norm=mcolors.CenteredNorm(halfrange=1))
[ ]:
scv.pl.scatter(adata_copy2, basis='z_umap', color=df_dd_.sort_values(by=['bayes_factor_v', 'log2_diff_v'], ascending=[False, True])[df_dd_['log2_diff_v']<0].index[:9], frameon=False, ncols=3, colorbar=False, wspace=0.1, hspace=0.1,
fontsize=30, color_map='coolwarm', norm=mcolors.CenteredNorm(halfrange=1))
[76]:
color_include = ['Erythrocyte', 'MEP', 'Megakaryocyte', 'Prog MK', 'Platelet']
[77]:
df_dd_mk = vv.model.differential_dynamics(adata, adata_atac, model=model, batch_key='batch', idx1=a, idx2=b, mode='change', save_raw=True)
Using reference batch for latent variable computation.
Using reference batch for latent variable computation.
[78]:
plt.hist(np.log(df_dd_mk['mean_c1']), bins=100, alpha=0.7);
plt.hist(np.log(df_dd_mk['mean_u1']), bins=100, alpha=0.7);
plt.hist(np.log(df_dd_mk['mean_s1']), bins=100, alpha=0.7);
[79]:
mk_ddg = df_dd_mk.sort_values(by='bayes_factor_v', ascending=False)[(df_dd_mk['log2_diff_v']>0)
& (np.log(df_dd_mk['mean_c1'])>-1)
& (np.log(df_dd_mk['mean_u1'])>-3)
& (np.log(df_dd_mk['mean_s1'])>-3)].index
len(mk_ddg)
[79]:
227
[80]:
mk_ddg
[80]:
Index(['HPSE', 'GNB5', 'KALRN', 'THBS1', 'LTBP1', 'ABCC3', 'ITGB3', 'TRPC6',
'ITGA6', 'SH3PXD2A',
...
'CENPF', 'PXK', 'NUCB2', 'SPINT2', 'ATP10D', 'SCAPER', 'NT5C3A',
'B4GALT5', 'SLC12A2', 'TCTN3'],
dtype='object', length=227)
[81]:
gene = 'SELP'
vv.differential_dynamics_plot(adata, gene, var='kc', color_by='leiden', color_include=color_include, title=True, legend=False, axis_on=False, frame_on=False)
vv.differential_dynamics_plot(adata, gene, var='rho', color_by='leiden', color_include=color_include, title=True, legend=False, axis_on=False, frame_on=False)
vv.differential_dynamics_plot(adata, gene, var='v', color_by='leiden', color_include=color_include, title=True, legend=False, axis_on=False, frame_on=False)
vv.differential_dynamics_plot(adata, gene, var='s', color_by='leiden', color_include=color_include, title=True, legend=False, axis_on=False, frame_on=False)
vv.dynamic_plot(adata_sub2, adata_atac[adata_sub2.obs_names,:], gene, color_by='leiden', by='velocity', axis_on=False, frame_on=False)
vv.dynamic_plot(adata_sub2, adata_atac[adata_sub2.obs_names,:], gene, color_by='leiden', batch_correction=True, axis_on=False, frame_on=False)
[82]:
gene = 'PF4'
vv.differential_dynamics_plot(adata, gene, var='kc', color_by='leiden', color_include=color_include, title=True, legend=False, axis_on=False, frame_on=False)
vv.differential_dynamics_plot(adata, gene, var='rho', color_by='leiden', color_include=color_include, title=True, legend=False, axis_on=False, frame_on=False)
vv.differential_dynamics_plot(adata, gene, var='v', color_by='leiden', color_include=color_include, title=True, legend=False, axis_on=False, frame_on=False)
vv.differential_dynamics_plot(adata, gene, var='s', color_by='leiden', color_include=color_include, title=True, legend=False, axis_on=False, frame_on=False)
vv.dynamic_plot(adata_sub2, adata_atac[adata_sub2.obs_names,:], gene, color_by='leiden', by='velocity', axis_on=False, frame_on=False)
vv.dynamic_plot(adata_sub2, adata_atac[adata_sub2.obs_names,:], gene, color_by='leiden', batch_correction=True, axis_on=False, frame_on=False)
[83]:
gene = 'ACTN1'
vv.differential_dynamics_plot(adata, gene, var='kc', color_by='leiden', color_include=color_include, title=True, legend=False, axis_on=False, frame_on=False)
vv.differential_dynamics_plot(adata, gene, var='rho', color_by='leiden', color_include=color_include, title=True, legend=False, axis_on=False, frame_on=False)
vv.differential_dynamics_plot(adata, gene, var='v', color_by='leiden', color_include=color_include, title=True, legend=False, axis_on=False, frame_on=False)
vv.differential_dynamics_plot(adata, gene, var='s', color_by='leiden', color_include=color_include, title=True, legend=False, axis_on=False, frame_on=False)
vv.dynamic_plot(adata_sub2, adata_atac[adata_sub2.obs_names,:], gene, color_by='leiden', by='velocity', axis_on=False, frame_on=False)
vv.dynamic_plot(adata_sub2, adata_atac[adata_sub2.obs_names,:], gene, color_by='leiden', batch_correction=True, axis_on=False, frame_on=False)
[84]:
df_dd_rb = vv.model.differential_dynamics(adata, adata_atac, model=model, batch_key='batch', idx1=b, idx2=a, mode='change', save_raw=True)
Using reference batch for latent variable computation.
Using reference batch for latent variable computation.
[85]:
plt.hist(np.log(df_dd_rb['mean_c1']), bins=100, alpha=0.7);
plt.hist(np.log(df_dd_rb['mean_u1']), bins=100, alpha=0.7);
plt.hist(np.log(df_dd_rb['mean_s1']), bins=100, alpha=0.7);
[86]:
rb_ddg = df_dd_rb.sort_values(by='bayes_factor_v', ascending=False)[(df_dd_rb['log2_diff_v']>0)
& (np.log(df_dd_rb['mean_c1'])>-1)
& (np.log(df_dd_rb['mean_u1'])>-3)
& (np.log(df_dd_rb['mean_s1'])>-3)].index
len(rb_ddg)
[86]:
200
[87]:
rb_ddg
[87]:
Index(['MINPP1', 'MYH10', 'FZD3', 'TFR2', 'XACT', 'HSPG2', 'EMID1', 'SLC40A1',
'DEPTOR', 'SLC12A6',
...
'TCF4', 'NCOA4', 'DDX10', 'NLN', 'MALT1', 'ASPH', 'BMP2K', 'PDE7A',
'ATP9B', 'FDX1'],
dtype='object', length=200)
[88]:
gene = 'TFR2'
vv.differential_dynamics_plot(adata, gene, var='kc', color_by='leiden', color_include=color_include, title=True, legend=False, axis_on=False, frame_on=False)
vv.differential_dynamics_plot(adata, gene, var='rho', color_by='leiden', color_include=color_include, title=True, legend=False, axis_on=False, frame_on=False)
vv.differential_dynamics_plot(adata, gene, var='v', color_by='leiden', color_include=color_include, title=True, legend=False, axis_on=False, frame_on=False)
vv.differential_dynamics_plot(adata, gene, var='s', color_by='leiden', color_include=color_include, title=True, legend=False, axis_on=False, frame_on=False)
vv.dynamic_plot(adata_sub2, adata_atac[adata_sub2.obs_names,:], gene, color_by='leiden', by='velocity', axis_on=False, frame_on=False)
vv.dynamic_plot(adata_sub2, adata_atac[adata_sub2.obs_names,:], gene, color_by='leiden', batch_correction=True, axis_on=False, frame_on=False)
[89]:
gene = 'SLC40A1'
vv.differential_dynamics_plot(adata, gene, var='kc', color_by='leiden', color_include=color_include, title=True, legend=False, axis_on=False, frame_on=False)
vv.differential_dynamics_plot(adata, gene, var='rho', color_by='leiden', color_include=color_include, title=True, legend=False, axis_on=False, frame_on=False)
vv.differential_dynamics_plot(adata, gene, var='v', color_by='leiden', color_include=color_include, title=True, legend=False, axis_on=False, frame_on=False)
vv.differential_dynamics_plot(adata, gene, var='s', color_by='leiden', color_include=color_include, title=True, legend=False, axis_on=False, frame_on=False)
vv.dynamic_plot(adata_sub2, adata_atac[adata_sub2.obs_names,:], gene, color_by='leiden', by='velocity', axis_on=False, frame_on=False)
vv.dynamic_plot(adata_sub2, adata_atac[adata_sub2.obs_names,:], gene, color_by='leiden', batch_correction=True, axis_on=False, frame_on=False)
Save the final result
[90]:
adata.write_h5ad(data_path_base+"/final.h5ad")
[ ]: