Preprocessing of single multi-omic dataset

[ ]:

import csv
import numpy as np
import pandas as pd
import scanpy as sc
import scvelo as scv
import matplotlib.pyplot as plt
import multivelovae as vv

[2]:

scv.settings.verbosity = 3
scv.settings.presenter_view = True
scv.set_figure_params('scvelo')
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 200)
np.set_printoptions(suppress=True)

[3]:

# Define cell cycle genes
s_genes_list = \
    ['Mcm5', 'Pcna', 'Tyms', 'Fen1', 'Mcm2', 'Mcm4', 'Rrm1', 'Ung', 'Gins2',
     'Mcm6', 'Cdca7', 'Dtl', 'Prim1', 'Uhrf1', 'CENPU', 'Hells', 'Rfc2',
     'Rpa2', 'Nasp', 'Rad51ap1', 'Gmnn', 'Wdr76', 'Slbp', 'Ccne2', 'Ubr7',
     'Pold3', 'Msh2', 'Atad2', 'Rad51', 'Rrm2', 'Cdc45', 'Cdc6', 'Exo1', 'Tipin',
     'Dscc1', 'Blm', 'Casp8ap2', 'Usp1', 'Clspn', 'Pola1', 'Chaf1b', 'Brip1', 'E2f8']

g2m_genes_list = \
    ['Hmgb2', 'Cdk1', 'Nusap1', 'Ube2c', 'Birc5', 'Tpx2', 'Top2a', 'Ndc80',
     'Cks2', 'Nuf2', 'Cks1b', 'Mki67', 'Tmpo', 'Cenpf', 'Tacc3', 'PIMREG',
     'Smc4', 'Ccnb2', 'Ckap2l', 'Ckap2', 'Aurkb', 'Bub1', 'Kif11', 'Anp32e',
     'Tubb4b', 'Gtse1', 'Kif20b', 'Hjurp', 'Cdca3', 'JPT1', 'Cdc20', 'Ttk',
     'Cdc25c', 'Kif2c', 'Rangap1', 'Ncapd2', 'Dlgap5', 'Cdca2', 'Cdca8',
     'Ect2', 'Kif23', 'Hmmr', 'Aurka', 'Psrc1', 'Anln', 'Lbr', 'Ckap5',
     'Cenpe', 'Ctcf', 'Nek2', 'G2e3', 'Gas2l3', 'Cbx5', 'Cenpa']
s_genes_list = np.array([x.upper() for x in s_genes_list])
g2m_genes_list = np.array([x.upper() for x in g2m_genes_list])

[4]:

# Define hemoglobin genes
hb_genes = np.array(['HBA1', 'HBA2', 'HBB', 'HBD', 'HBE1', 'HBG1', 'HBG2', 'HBQ1', 'HBM', 'HBZ'])

[5]:

# Define marker genes for various cell types
marker_genes = np.array(['PROM1', 'CRHBP', 'HLF', 'FAM30A',  # HSC
                         'AZU1', 'MPO', 'PRSS57',  # GMP
                         'SPINK2', 'C1QTNF4',  # LMPP
                         'LYZ', 'VCAN', 'S100A8',  # cDC
                         'CSTA', 'FCN1', 'CD14',  # Monocyte
                         'TCF4', 'IRF8', 'SPIB',  # pDC
                         'SLC40A1',  # MEP
                         'HBB', 'HBD', 'KLF1', 'CA1',  # Erythrocyte
                         'HDC', 'PRG2', 'LMO4', 'SLC45A3', 'IKZF2',  # Granulocyte
                         'RHEX', 'SIGLEC6', 'HPGDS', 'TPSAB1',  # Mast Cells
                         'ELANE',  # NK
                         'IL7R',  # Prog T
                         'CD72', 'DNTT', 'EBF1',  # Prog B
                         'FCER1A', 'F2R',  # Megakaryocyte
                         'PLEK', 'PPBP', 'ITGA2B', 'PF4', 'TUBB1',  # Platelet
                         'FABP4', 'CD36',  # Adipocyte
                         'LYST', 'C1QC', 'CD86', 'SPP1', 'GPNMB', 'MRC1', 'CTSS', 'FTL'  # Macrophage
                         ])

Preprocess ATAC counts

[6]:

adata_atac_ = sc.read_10x_mtx('filtered_feature_bc_matrix/', var_names='gene_symbols', cache=True, gex_only=False)
adata_atac_ = adata_atac_[:,adata_atac_.var['feature_types'] == "Peaks"]
adata_atac_

[6]:

View of AnnData object with n_obs × n_vars = 10067 × 165654
    var: 'gene_ids', 'feature_types'

[7]:

adata_atac_ = vv.aggregate_peaks_10x(adata_atac_,
                                     'atac_peak_annotation.tsv',
                                     'analysis/feature_linkage/feature_linkage.bedpe',
                                     verbose=True)

CellRanger ARC identified as 2.0.0
Found 20885 genes with promoter peaks

[8]:

plt.hist(adata_atac_.X.sum(1), bins=100);

[9]:

sc.pp.filter_cells(adata_atac_, min_counts=5000)

Preprocess RNA counts

[10]:

adata_ = sc.read_10x_mtx('filtered_feature_bc_matrix/', var_names='gene_symbols', cache=True, gex_only=True)
adata_.var_names_make_unique()

[11]:

plt.hist(adata_.X.sum(1), bins=100);

[12]:

sc.pp.filter_cells(adata_, min_genes=500)
sc.pp.filter_genes(adata_, min_cells=10)

[13]:

# Compute QC metrics
adata_.var['mt'] = adata_.var_names.str.startswith('MT-')
adata_.var['rb'] = adata_.var_names.str.startswith(('RPS', 'RPL'))
adata_.var['hla'] = adata_.var_names.str.startswith('HLA-')
adata_.var['hb'] = np.in1d(adata_.var_names, hb_genes)
sc.pp.calculate_qc_metrics(adata_, qc_vars=['mt', 'rb', 'hla', 'hb'], percent_top=[20], log1p=True, inplace=True)

[14]:

sc.pl.violin(adata_, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt', 'pct_counts_rb', 'pct_counts_hla', 'pct_counts_hb', 'pct_counts_in_top_20_genes'], jitter=0.4, multi_panel=True)

[15]:

sc.pl.scatter(adata_, x='total_counts', y='n_genes_by_counts', color='pct_counts_mt')
sc.pl.scatter(adata_, x='log1p_total_counts', y='log1p_n_genes_by_counts', color='pct_counts_mt')
sc.pl.scatter(adata_, x='total_counts', y='pct_counts_mt', color='pct_counts_in_top_20_genes')
sc.pl.scatter(adata_, x='total_counts', y='pct_counts_rb', color='pct_counts_mt')
sc.pl.scatter(adata_, x='pct_counts_mt', y='pct_counts_rb', color='n_genes_by_counts')
sc.pl.scatter(adata_, x='total_counts', y='pct_counts_hla', color='pct_counts_mt')
sc.pl.scatter(adata_, x='total_counts', y='pct_counts_hb', color='pct_counts_mt')

[16]:

# Filter out outliers based on QC metrics with autodetected thresholds
adata_.obs["outlier"] = (
    vv.is_outlier(adata_, "log1p_total_counts", 3, 4)
    | vv.is_outlier(adata_, "log1p_n_genes_by_counts", 3, 4)
    | vv.is_outlier(adata_, "pct_counts_in_top_20_genes", 4, 4)
    | vv.is_outlier(adata_, "pct_counts_mt", 4, 3)
    | vv.is_outlier(adata_, "pct_counts_rb", 3, 4)
)
adata_.obs.outlier.value_counts()

total_counts lower_bound 3421.936767913161, upper_bound 33089.59415179074
n_genes_by_counts lower_bound 2007.9104862490526, upper_bound 8357.539854741306
pct_counts_in_top_20_genes lower_bound 9.288843340823774, upper_bound 23.440522986465083
pct_counts_mt lower_bound 1.85882568359375, upper_bound 18.090338706970215
pct_counts_rb lower_bound 5.138129234313965, upper_bound 20.41178321838379

[16]:

False    8697
True     1305
Name: outlier, dtype: int64

[17]:

adata_ = adata_[~adata_.obs.outlier].copy()
adata_

[17]:

AnnData object with n_obs × n_vars = 8697 × 22527
    obs: 'n_genes', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_rb', 'log1p_total_counts_rb', 'pct_counts_rb', 'total_counts_hla', 'log1p_total_counts_hla', 'pct_counts_hla', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'outlier'
    var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'rb', 'hla', 'hb', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'

[18]:

sc.pp.filter_genes(adata_, min_counts=20)

Filter highly-variable genes in total RNA space

[19]:

sc.pp.highly_variable_genes(adata_, n_top_genes=3000, flavor="seurat_v3")
sc.pl.highly_variable_genes(adata_)

[20]:

np.sum(adata_.var.highly_variable)

[20]:

Preprocess unspliced and spliced counts from STARsolo

[21]:

adata = sc.read_10x_mtx('starsolo-filtered/', var_names='gene_symbols', make_unique=True, cache=True, gex_only=True)
adata.layers['unspliced'] = sc.read_10x_mtx('starsolo-filtered/', prefix='unspliced_', var_names='gene_symbols', make_unique=True, cache=True, gex_only=True).X
adata.layers['spliced'] = sc.read_10x_mtx('starsolo-filtered/', prefix='spliced_', var_names='gene_symbols', make_unique=True, cache=True, gex_only=True).X
adata.obs_names = [x + '-1' for x in adata.obs_names]

[22]:

shared_cells = np.intersect1d(adata.obs_names, adata_.obs_names)
adata = adata[shared_cells,:]
adata

[22]:

View of AnnData object with n_obs × n_vars = 8697 × 36601
    var: 'gene_ids', 'feature_types'
    layers: 'unspliced', 'spliced'

[23]:

adata.var['HVG'] = False
adata.var.loc[adata_.var_names[adata_.var.highly_variable], 'HVG'] = True

[24]:

sc.pp.filter_genes(adata, min_cells=10)

[25]:

plt.hist(adata.X.sum(1), bins=100);

[26]:

plt.hist(adata.layers['unspliced'].sum(1), bins=100, alpha=0.5);
plt.hist(adata.layers['spliced'].sum(1), bins=100, alpha=0.5);

[27]:

# Define some extra metrics
counts_u = adata.layers['unspliced'].sum(1).A1
counts_s = adata.layers['spliced'].sum(1).A1
adata.obs['total_unspliced'] = counts_u
adata.obs['total_spliced'] = counts_s
adata.obs['log1p_total_unspliced'] = np.log1p(counts_u)
adata.obs['log1p_total_spliced'] = np.log1p(counts_s)
adata.obs['fraction_u'] = counts_u / (counts_s + counts_u)

[28]:

adata.var['mt'] = adata.var_names.str.startswith('MT-')
adata.var['rb'] = adata.var_names.str.startswith(('RPS', 'RPL'))
adata.var['hla'] = adata.var_names.str.startswith('HLA-')
adata.var['hb'] = np.in1d(adata.var_names, hb_genes)
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt', 'rb', 'hla', 'hb'], percent_top=[20], log1p=True, inplace=True)

[29]:

sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt', 'pct_counts_rb', 'pct_counts_hla', 'pct_counts_hb', 'pct_counts_in_top_20_genes'], jitter=0.4, multi_panel=True)

[30]:

sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts', color='pct_counts_mt')
sc.pl.scatter(adata, x='total_counts', y='pct_counts_mt', color='pct_counts_in_top_20_genes')
sc.pl.scatter(adata, x='total_counts', y='pct_counts_rb', color='pct_counts_mt')
sc.pl.scatter(adata, x='pct_counts_mt', y='pct_counts_rb', color='n_genes_by_counts')

[31]:

# Filter out outliers based on QC metrics with autodetected thresholds
adata.obs["outlier"] = (
    vv.is_outlier(adata, "log1p_total_counts", 3, 4)
    | vv.is_outlier(adata, "log1p_total_unspliced", 4, 5)
    | vv.is_outlier(adata, "log1p_total_spliced", 4, 5)
    | vv.is_outlier(adata, "pct_counts_mt", 4, 3)
    | vv.is_outlier(adata, "pct_counts_rb", 3, 4)
)
adata.obs.outlier.value_counts()

total_counts lower_bound 2425.4803786099255, upper_bound 22132.372237304127
total_unspliced lower_bound 982.7264742937743, upper_bound 12939.346125136475
total_spliced lower_bound 1481.9934033993256, upper_bound 24207.00390625
pct_counts_mt lower_bound 6.206668853759766, upper_bound 24.5504732131958
pct_counts_rb lower_bound 10.420002937316895, upper_bound 26.875526428222656

[31]:

False    8193
True      504
Name: outlier, dtype: int64

[32]:

adata = adata[~adata.obs.outlier].copy()
adata

[32]:

AnnData object with n_obs × n_vars = 8193 × 18027
    obs: 'total_unspliced', 'total_spliced', 'log1p_total_unspliced', 'log1p_total_spliced', 'fraction_u', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_rb', 'log1p_total_counts_rb', 'pct_counts_rb', 'total_counts_hla', 'log1p_total_counts_hla', 'pct_counts_hla', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'outlier'
    var: 'gene_ids', 'feature_types', 'HVG', 'n_cells', 'mt', 'rb', 'hla', 'hb', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'
    layers: 'unspliced', 'spliced'

[33]:

scv.pp.filter_genes(adata, min_shared_counts=20)
shared_genes = np.intersect1d(adata.var_names, adata_atac_.var_names)
adata = adata[:, shared_genes]
adata

Filtered out 7804 genes that are detected 20 counts (shared).

[33]:

View of AnnData object with n_obs × n_vars = 8193 × 9844
    obs: 'total_unspliced', 'total_spliced', 'log1p_total_unspliced', 'log1p_total_spliced', 'fraction_u', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_rb', 'log1p_total_counts_rb', 'pct_counts_rb', 'total_counts_hla', 'log1p_total_counts_hla', 'pct_counts_hla', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'outlier', 'initial_size_unspliced', 'initial_size_spliced', 'initial_size'
    var: 'gene_ids', 'feature_types', 'HVG', 'n_cells', 'mt', 'rb', 'hla', 'hb', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'
    layers: 'unspliced', 'spliced'

[34]:

adata.obs["outlier"] = (
    vv.is_outlier(adata, "fraction_u")
)
adata.obs.outlier.value_counts()

fraction_u lower_bound 0.07987459003925323, upper_bound 0.6870881915092468

[34]:

False    8193
Name: outlier, dtype: int64

[35]:

shared_cells = pd.Index(np.intersect1d(adata.obs_names, adata_atac_.obs_names))
shared_genes = pd.Index(np.intersect1d(adata.var_names, adata_atac_.var_names))
shared_cells, shared_genes

[35]:

(Index(['AAACAGCCAGCCGCTA-1', 'AAACAGCCAGTAGCCT-1', 'AAACATGCAAGGTACG-1',
        'AAACATGCAATTAACC-1', 'AAACATGCAATTGAAG-1', 'AAACATGCACCTCACC-1',
        'AAACATGCAGGCTAGA-1', 'AAACCAACAAAGCTCC-1', 'AAACCAACACATGCTA-1',
        'AAACCAACAGCCGCTA-1',
        ...
        'TTTGTGAAGTTGTCAA-1', 'TTTGTGGCAAGCGAGC-1', 'TTTGTGGCAGACAAAC-1',
        'TTTGTGGCAGCAAGGC-1', 'TTTGTGGCAGGCTGTT-1', 'TTTGTGGCAGGTCCTG-1',
        'TTTGTGTTCCATCAGG-1', 'TTTGTGTTCCCGCATT-1', 'TTTGTTGGTGCATCGG-1',
        'TTTGTTGGTGTGAGAG-1'],
       dtype='object', length=6336),
 Index(['A1BG-AS1', 'AAAS', 'AACS', 'AADAT', 'AAGAB', 'AAK1', 'AAMDC', 'AAMP',
        'AARS', 'AARS2',
        ...
        'ZSWIM8', 'ZSWIM9', 'ZUP1', 'ZW10', 'ZWILCH', 'ZWINT', 'ZXDC', 'ZYG11B',
        'ZYX', 'ZZEF1'],
       dtype='object', length=9844))

[ ]:

adata = adata[shared_cells, shared_genes].copy()
adata_atac = adata_atac_[shared_cells, shared_genes].copy()

Save preprocessed AnnData before normalization

[37]:

adata.write_h5ad('8489-MV-1_adata_prepro.h5ad')
adata_atac.write_h5ad('8489-MV-1_adata_atac_prepro.h5ad')

[38]:

adata = sc.read_h5ad('8489-MV-1_adata_prepro.h5ad')
adata_atac = sc.read_h5ad('8489-MV-1_adata_atac_prepro.h5ad')

[39]:

adata

[39]:

AnnData object with n_obs × n_vars = 6336 × 9844
    obs: 'total_unspliced', 'total_spliced', 'log1p_total_unspliced', 'log1p_total_spliced', 'fraction_u', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_rb', 'log1p_total_counts_rb', 'pct_counts_rb', 'total_counts_hla', 'log1p_total_counts_hla', 'pct_counts_hla', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'outlier', 'initial_size_unspliced', 'initial_size_spliced', 'initial_size'
    var: 'gene_ids', 'feature_types', 'HVG', 'n_cells', 'mt', 'rb', 'hla', 'hb', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'
    layers: 'spliced', 'unspliced'

Normalize ATAC and RNA counts

[40]:

vv.model.tfidf_norm(adata_atac)

[41]:

scv.pp.normalize_per_cell(adata)

Normalized count data: X, spliced, unspliced.

Compute cell cycle scores

[42]:

s_genes_list_sub = s_genes_list[np.isin(s_genes_list, adata.var_names)]
g2m_genes_list_sub = g2m_genes_list[np.isin(g2m_genes_list, adata.var_names)]
scv.tl.score_genes_cell_cycle(adata, s_genes=s_genes_list_sub, g2m_genes=g2m_genes_list_sub)

calculating cell cycle phase
-->     'S_score' and 'G2M_score', scores of cell cycle phases (adata.obs)

Check genes influenced by cell cycle

[43]:

adata_copy = adata.copy()

[44]:

# Regress out cell cycle effect in a copy and add intercept back
vv.regress_out(adata_copy, ['S_score', 'G2M_score'], add_intercept=True)

[45]:

# Select genes in a copy based on dispersions after regressing out cell cycle
vv.filter_genes_dispersion(adata_copy, min_mean=0.0125, max_mean=4, min_disp=0.4, subset=False)
sc.pl.highly_variable_genes(adata_copy)

Extracted 1772 highly variable genes.

[46]:

if 'PROM1' in adata_copy.var_names:
    print(adata_copy[:, 'PROM1'].var)

              gene_ids    feature_types   HVG  n_cells     mt     rb    hla  \
PROM1  ENSG00000007062  Gene Expression  True     2496  False  False  False

          hb  n_cells_by_counts  mean_counts  log1p_mean_counts  \
PROM1  False               2496     0.430493           0.358019

       pct_dropout_by_counts  total_counts  log1p_total_counts  \
PROM1              71.300448        3744.0            8.228177

       gene_count_corr    means  dispersions  dispersions_norm  \
PROM1          -0.1283  0.34751     0.495241          1.700807

       highly_variable
PROM1             True

[47]:

np.sum(adata_copy.var.highly_variable), np.sum(adata_copy.var.HVG), np.sum(adata_copy.var.highly_variable & adata_copy.var.HVG)

[47]:

(1772, 1601, 816)

[48]:

sc.pp.log1p(adata)

[49]:

# Compute cell cycle phase difference
scv.tl.score_genes_cell_cycle(adata, s_genes=s_genes_list_sub, g2m_genes=g2m_genes_list_sub)
adata.obs['CC_difference'] = adata.obs['S_score'] - adata.obs['G2M_score']
len(s_genes_list_sub), len(g2m_genes_list_sub)

calculating cell cycle phase
-->     'S_score' and 'G2M_score', scores of cell cycle phases (adata.obs)

[49]:

(42, 52)

[50]:

# Recompute some extra metrics
counts_u = adata.layers['unspliced'].sum(1).A1
counts_s = adata.layers['spliced'].sum(1).A1
adata.obs['total_unspliced'] = counts_u
adata.obs['total_spliced'] = counts_s
adata.obs['log1p_total_unspliced'] = np.log1p(counts_u)
adata.obs['log1p_total_spliced'] = np.log1p(counts_s)
adata.obs['fraction_u'] = counts_u / (counts_s + counts_u)

[51]:

adata = adata[:, adata_copy.var.highly_variable & adata_copy.var.HVG]
adata.var['highly_variable'] = True

[52]:

adata_atac = adata_atac[adata.obs_names, adata.var_names]

[53]:

adata

[53]:

AnnData object with n_obs × n_vars = 6336 × 816
    obs: 'total_unspliced', 'total_spliced', 'log1p_total_unspliced', 'log1p_total_spliced', 'fraction_u', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_rb', 'log1p_total_counts_rb', 'pct_counts_rb', 'total_counts_hla', 'log1p_total_counts_hla', 'pct_counts_hla', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'outlier', 'initial_size_unspliced', 'initial_size_spliced', 'initial_size', 'n_counts', 'S_score', 'G2M_score', 'phase', 'CC_difference'
    var: 'gene_ids', 'feature_types', 'HVG', 'n_cells', 'mt', 'rb', 'hla', 'hb', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'gene_count_corr', 'highly_variable'
    uns: 'log1p'
    layers: 'spliced', 'unspliced'

[54]:

# Recompute some extra metrics
counts_u = adata.layers['unspliced'].sum(1).A1
counts_s = adata.layers['spliced'].sum(1).A1
adata.obs['total_unspliced2'] = counts_u
adata.obs['total_spliced2'] = counts_s
adata.obs['log1p_total_unspliced2'] = np.log1p(counts_u)
adata.obs['log1p_total_spliced2'] = np.log1p(counts_s)
adata.obs['fraction_u2'] = counts_u / (counts_s + counts_u)

[55]:

# Regress out cell cycle difference from original adata
sc.pp.regress_out(adata, ['CC_difference'])

[56]:

sc.pp.scale(adata)

[57]:

scv.pp.moments(adata, n_pcs=30, n_neighbors=50)

computing neighbors
    finished (0:00:02) --> added
    'distances' and 'connectivities', weighted adjacency matrices (adata.obsp)
computing moments based on connectivities
    finished (0:00:00) --> added
    'Ms' and 'Mu', moments of un/spliced abundances (adata.layers)

[58]:

scv.tl.umap(adata)

[59]:

adata.obs['n_c'] = adata_atac.X.sum(1)
adata.obs['n_Mu'] = np.sum(adata.layers['Mu'], 1)
adata.obs['n_Ms'] = np.sum(adata.layers['Ms'], 1)

Visualize QC metrics on UMAP

[60]:

sc.pl.umap(adata, color=['n_counts', 'n_c', 'n_Mu', 'n_Ms',
                         'S_score', 'G2M_score', 'total_unspliced', 'total_spliced',
                         'fraction_u', 'total_unspliced2', 'total_spliced2', 'fraction_u2',
                         'total_counts', 'pct_counts_mt', 'pct_counts_rb', 'pct_counts_hla',
                         'pct_counts_hb', 'pct_counts_in_top_20_genes', 'CC_difference'])

/home/chen/.mambaforge/lib/python3.10/site-packages/scanpy/plotting/_tools/scatterplots.py:163: MatplotlibDeprecationWarning: The get_cmap function was deprecated in Matplotlib 3.7 and will be removed two minor releases later. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap(obj)`` instead.
  cmap = copy(get_cmap(cmap))

Visualize marker gene expressions on UMAP

[61]:

mg = marker_genes[np.isin(marker_genes, adata.var_names)]
sc.pl.umap(adata, color=mg)

/home/chen/.mambaforge/lib/python3.10/site-packages/scanpy/plotting/_tools/scatterplots.py:163: MatplotlibDeprecationWarning: The get_cmap function was deprecated in Matplotlib 3.7 and will be removed two minor releases later. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap(obj)`` instead.
  cmap = copy(get_cmap(cmap))

Cluster and annotation

[62]:

sc.tl.leiden(adata, resolution=1.6)
sc.pl.umap(adata, color='leiden')

/home/chen/.mambaforge/lib/python3.10/site-packages/scanpy/plotting/_tools/scatterplots.py:163: MatplotlibDeprecationWarning: The get_cmap function was deprecated in Matplotlib 3.7 and will be removed two minor releases later. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap(obj)`` instead.
  cmap = copy(get_cmap(cmap))

[ ]:

new_cluster_names = ['LMPP',
                     'Megakaryocyte',
                     'HSC 1',
                     'Prog MK',
                     'HSC 2',
                     'GMP',
                     'MEP',
                     'Prog DC',
                     'Erythrocyte',
                     'CMP',
                     'Granulocyte',
                     'Platelet',
                     'DC',
                     'Mast Cells'
                     ]
adata.rename_categories('leiden', new_cluster_names)
new_cluster_names = adata.obs['leiden'].to_numpy()
new_cluster_names[(new_cluster_names == 'HSC 1') | (new_cluster_names == 'HSC 2')] = 'HSC'
adata.obs['leiden'] = new_cluster_names
del adata.uns['leiden_colors']
sc.pl.umap(adata, color='leiden')

/home/chen/.mambaforge/lib/python3.10/site-packages/scanpy/plotting/_tools/scatterplots.py:163: MatplotlibDeprecationWarning: The get_cmap function was deprecated in Matplotlib 3.7 and will be removed two minor releases later. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap(obj)`` instead.
  cmap = copy(get_cmap(cmap))

[64]:

sc.tl.rank_genes_groups(adata, 'leiden', method='wilcoxon')
sc.pl.rank_genes_groups(adata, n_genes=15, sharey=False)

[65]:

adata

[65]:

AnnData object with n_obs × n_vars = 6336 × 816
    obs: 'total_unspliced', 'total_spliced', 'log1p_total_unspliced', 'log1p_total_spliced', 'fraction_u', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_rb', 'log1p_total_counts_rb', 'pct_counts_rb', 'total_counts_hla', 'log1p_total_counts_hla', 'pct_counts_hla', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'outlier', 'initial_size_unspliced', 'initial_size_spliced', 'initial_size', 'n_counts', 'S_score', 'G2M_score', 'phase', 'CC_difference', 'total_unspliced2', 'total_spliced2', 'log1p_total_unspliced2', 'log1p_total_spliced2', 'fraction_u2', 'n_c', 'n_Mu', 'n_Ms', 'leiden'
    var: 'gene_ids', 'feature_types', 'HVG', 'n_cells', 'mt', 'rb', 'hla', 'hb', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'gene_count_corr', 'highly_variable', 'mean', 'std'
    uns: 'log1p', 'pca', 'neighbors', 'umap', 'leiden', 'leiden_colors', 'rank_genes_groups'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    layers: 'spliced', 'unspliced', 'Ms', 'Mu'
    obsp: 'distances', 'connectivities'

Prepare Seurat WNN

[66]:

with open('filtered_cells.txt', 'w', newline ='') as f:
    writer = csv.writer(f, delimiter=',')
    writer.writerows(adata_atac.obs_names[:,None])
with open('leiden.txt', 'w', newline ='') as f:
    writer = csv.writer(f, delimiter=',')
    writer.writerows(np.array(adata.obs['leiden'].values)[:,None])

[67]:

### Run seurat_wnn.R ###

[68]:

nn_idx = np.loadtxt("seurat/nn_idx.txt", delimiter=',')
nn_dist = np.loadtxt("seurat/nn_dist.txt", delimiter=',')
nn_cell_names = list(pd.read_csv("seurat/nn_cell_names.txt", header=None)[0])
nn_idx.shape

[68]:

(6336, 50)

[69]:

np.all(nn_cell_names == adata.obs_names)

[69]:

True

[70]:

adata_atac.obs['leiden'] = adata.obs['leiden']

Smooth accessibilities by neighbors

[71]:

vv.model.knn_smooth_chrom(adata_atac, nn_idx, nn_dist)

[72]:

adata, adata_atac

[72]:

(AnnData object with n_obs × n_vars = 6336 × 816
     obs: 'total_unspliced', 'total_spliced', 'log1p_total_unspliced', 'log1p_total_spliced', 'fraction_u', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_rb', 'log1p_total_counts_rb', 'pct_counts_rb', 'total_counts_hla', 'log1p_total_counts_hla', 'pct_counts_hla', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'outlier', 'initial_size_unspliced', 'initial_size_spliced', 'initial_size', 'n_counts', 'S_score', 'G2M_score', 'phase', 'CC_difference', 'total_unspliced2', 'total_spliced2', 'log1p_total_unspliced2', 'log1p_total_spliced2', 'fraction_u2', 'n_c', 'n_Mu', 'n_Ms', 'leiden'
     var: 'gene_ids', 'feature_types', 'HVG', 'n_cells', 'mt', 'rb', 'hla', 'hb', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'gene_count_corr', 'highly_variable', 'mean', 'std'
     uns: 'log1p', 'pca', 'neighbors', 'umap', 'leiden', 'leiden_colors', 'rank_genes_groups'
     obsm: 'X_pca', 'X_umap'
     varm: 'PCs'
     layers: 'spliced', 'unspliced', 'Ms', 'Mu'
     obsp: 'distances', 'connectivities',
 AnnData object with n_obs × n_vars = 6336 × 816
     obs: 'n_counts', 'leiden'
     layers: 'Mc'
     obsp: 'connectivities')

Save post-processed AnnData

[73]:

adata.write_h5ad('8489-MV-1_adata_postpro.h5ad')
adata_atac.write_h5ad('8489-MV-1_adata_atac_postpro.h5ad')

[ ]: