Preleukemia scRNAseq Sample Analysis (Part 2 in Python)¶
Naraen Palanikumar¶
This is part 2 of the analysis and dashboarding project completed using publicly available scRNAseq/gene signature/reference atlas data from this paper (https://doi.org/10.1016/j.xgen.2023.100426) concerning gene markers associated with determine Acute myeloid leukemia (AML) afflicted patient outcomes and AML clinical data from NCI.
InĀ [1]:
# --- Imports Section ---
import scanpy as sc
import cellrank as cr
import numpy as np
import pandas as pd
import seaborn as sns
import decoupler as dc
from gprofiler import GProfiler
import matplotlib.pyplot as plt
from lifelines import KaplanMeierFitter
from lifelines.statistics import logrank_test
InĀ [2]:
# --- Load Data ---
adata = sc.read_h5ad("data/seurat_inter/seurat_integrated.h5ad")
InĀ [3]:
# --- Prepare Data ---
adada = sc.pp.subsample(adata, n_obs=30000) #need to subsample to avoid crashing when computing fate probabilities
adata.obs['celltype'] = adata.obs['predicted.id']
adata = adata[adata.obs['celltype'] != 'Unassigned'].copy()
adata.obs['celltype'] = adata.obs['celltype'].astype('category')
InĀ [4]:
# --- Cell Rank Workflow ---
start_states = ['HSCs'] #the cells that we start with
end_states = ['Late_Ery', 'Lym', 'Baso', 'Mast', 'Megakry', 'MO_DC', 'Neu'] #final mature cells
#running nearest neighbors and diffmap
sc.pp.neighbors(adata) # This computes the nearest-neighbor graph
sc.tl.diffmap(adata)
InĀ [5]:
#root cell
root_cell_name = adata.obs_names[adata.obs['celltype'] == 'HSCs'][0]
root_cell_index = np.where(adata.obs_names == root_cell_name)[0][0]
adata.uns['iroot'] = root_cell_index
sc.tl.dpt(adata, n_dcs = 10)
InĀ [6]:
#transition matrix
kernel = cr.kernels.PseudotimeKernel(adata, time_key='dpt_pseudotime')
kernel.compute_transition_matrix()
0%| | 0/29516 [00:00<?, ?cell/s]
Out[6]:
PseudotimeKernel[n=29516, dnorm=False, scheme='hard', frac_to_keep=0.3]
InĀ [7]:
#fate probabilities
estimator = cr.estimators.GPCCA(kernel)
estimator.compute_schur(n_components=15)
estimator.compute_macrostates(n_states=8, cluster_key="celltype")
/home/narae/miniconda3/envs/general/lib/python3.11/site-packages/pygpcca/_gpcca.py:1274: UserWarning: Stationary distribution couldn't be calculated. Reason: Top eigenvector has both positive and negative entries. It has range = [-2.7733622951334385e-10, 0.4924722753847856].
warnings.warn(f"Stationary distribution couldn't be calculated. Reason: {e}.")
Out[7]:
GPCCA[kernel=PseudotimeKernel[n=29516], initial_states=None, terminal_states=None]
InĀ [8]:
#correcting end states and start states
print("Identified macrostates:", list(estimator.macrostates.unique()))
hsc_mask = (adata.obs['celltype'] == 'HSCs')
initial_states_series = pd.Series(None, index=adata.obs_names, dtype="object")
initial_states_series[hsc_mask] = 'HSCs'
initial_states_categorical = initial_states_series.astype('category')
corrected_end_states = ['Neu', 'Baso', 'Immature_2', 'Megakry', 'Late_Ery', 'Mast', 'Mo_DC', 'Immature_1']
Identified macrostates: [nan, 'Neu', 'Baso', 'Immature_2', 'Megakry', 'Late_Ery', 'Mast', 'Mo_DC', 'Immature_1']
InĀ [9]:
#runs the estimator
estimator.set_terminal_states(corrected_end_states)
estimator.set_initial_states(initial_states_categorical)
Out[9]:
GPCCA[kernel=PseudotimeKernel[n=29516], initial_states=['HSCs'], terminal_states=['Baso', 'Immature_1', 'Immature_2', 'Late_Ery', 'Mast', 'Megakry', 'Mo_DC', 'Neu']]
InĀ [10]:
#compute fate probabilities
estimator.compute_fate_probabilities(solver = 'direct')
InĀ [12]:
#plot these on umap
sc.tl.umap(adata)
estimator.plot_macrostates(which = 'all', save='macrostates.png')
estimator.plot_fate_probabilities(save='fate_probabilities.png')
sc.pl.umap(
adata,
color='dpt_pseudotime', # Use the existing column
cmap='gnuplot2',
save='dpt_pseudotime.png'
)
/home/narae/miniconda3/envs/general/lib/python3.11/site-packages/scvelo/plotting/scatter.py:656: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored smp = ax.scatter( /home/narae/miniconda3/envs/general/lib/python3.11/site-packages/scvelo/plotting/scatter.py:694: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored ax.scatter( /home/narae/miniconda3/envs/general/lib/python3.11/site-packages/scvelo/plotting/utils.py:1396: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored ax.scatter(x, y, s=bg_size, marker=".", c=bg_color, zorder=zord - 2, **kwargs) /home/narae/miniconda3/envs/general/lib/python3.11/site-packages/scvelo/plotting/utils.py:1397: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored ax.scatter(x, y, s=gp_size, marker=".", c=gp_color, zorder=zord - 1, **kwargs)
saving figure to file ./figures/scvelo_macrostates.png
/home/narae/miniconda3/envs/general/lib/python3.11/site-packages/scvelo/plotting/scatter.py:656: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored smp = ax.scatter( /home/narae/miniconda3/envs/general/lib/python3.11/site-packages/scvelo/plotting/scatter.py:656: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored smp = ax.scatter(
saving figure to file ./figures/scvelo_fate_probabilities.png
WARNING: saving figure to file figures/umapdpt_pseudotime.png
InĀ [14]:
#standard umap for comparison
sc.pl.umap(
adata,
color='seurat_clusters',
legend_loc='on data',
title='UMAP by Seurat Cluster',
save='umap_seurat_clusters.png'
)
sc.pl.umap(
adata,
color='celltype',
legend_loc='on data',
title='UMAP by Cell Type Annotation',
save='umap_celltype.png'
)
WARNING: saving figure to file figures/umapumap_seurat_clusters.png
WARNING: saving figure to file figures/umapumap_celltype.png
InĀ [15]:
#make sure condition is categorical and find diffex genes (need to rename columns)
sc.tl.rank_genes_groups(adata, groupby='condition', method='wilcoxon', reference='WT')
adata.obs['condition'] = adata.obs['condition'].astype('category')
result = adata.uns['rank_genes_groups']
group_to_show = 'Mutant'
corrected_deg_df = pd.DataFrame()
gene_indices_raw = result['names'][group_to_show]
gene_indices_int = gene_indices_raw.astype(int)
corrected_deg_df['names'] = [adata.var_names[i] for i in gene_indices_int]
corrected_deg_df['logfoldchanges'] = result['logfoldchanges'][group_to_show]
corrected_deg_df['pvals_adj'] = result['pvals_adj'][group_to_show]
corrected_deg_df['scores'] = result['scores'][group_to_show]
print(corrected_deg_df.head(20))
names logfoldchanges pvals_adj scores 0 Rhd 0.025457 9.330686e-113 22.814850 1 Sec14l2 0.835223 4.267172e-108 22.329273 2 Hmox1 1.136945 1.993695e-76 18.754038 3 Hpn 0.219902 5.627877e-71 18.065952 4 Lgmn 0.957866 1.641727e-58 16.384630 5 Cfp 0.144920 2.780333e-51 15.320710 6 Dntt -0.010951 4.087441e-48 14.834328 7 Acp5 0.393075 3.879353e-46 14.519506 8 Hba-a1 0.440915 3.571375e-44 14.196410 9 Aqp9 -0.167073 1.748419e-43 14.079589 10 Wfdc17 0.065748 2.642086e-40 13.535566 11 S100a8 0.337973 5.928302e-37 12.942850 12 Ermap -0.285836 3.327176e-36 12.807915 13 Atp1b2 -0.203480 4.939833e-34 12.399111 14 C1qa 1.418073 3.388438e-33 12.240120 15 Pdia2 1.857676 7.028306e-32 11.987204 16 Itgax 2.040397 2.754480e-29 11.473605 17 Bmx 0.290402 2.206522e-27 11.077021 18 Marcks 0.056394 4.184313e-27 11.015962 19 Ptgs1 -0.015898 3.453424e-25 10.598693
InĀ [16]:
#metabolic pathway gene enrichment analysis
upregulated_genes = corrected_deg_df[(corrected_deg_df['pvals_adj'] < 0.05) & (corrected_deg_df['logfoldchanges'] > 0)]
gene_list = upregulated_genes['names'].tolist()
gp = GProfiler(return_dataframe=True)
go_results = gp.profile(
query=gene_list,
organism='hsapiens'
)
biological_processes = go_results[go_results['source'] == 'GO:BP']
metabolic_keywords = ['METABOLIC', 'GLYCOLYSIS', 'GLUCOSE', 'RESPIRATION', 'TCA', 'KREBS', 'FATTY_ACID', 'OXIDATIVE_PHOSPHORYLATION', 'METABOLISM', 'GLYCOLYTIC']
metabolic_terms = biological_processes[biological_processes['name'].str.contains('|'.join(metabolic_keywords), case=False)]
print(metabolic_terms.sort_values('p_value')[['p_value', 'name', 'intersection_size']].head(20))
p_value name \
228 0.008521 reactive oxygen species metabolic process
263 0.017585 lipid metabolic process
286 0.031159 positive regulation of macromolecule metabolic...
299 0.039138 superoxide metabolic process
303 0.042458 positive regulation of metabolic process
intersection_size
228 13
263 36
286 65
299 7
303 69
InĀ [17]:
#plotting metabolic pathway values
plot_df = metabolic_terms.sort_values('p_value')[['p_value', 'name', 'intersection_size']].head(15)
plot_df['norm_pval'] = (-1)*(np.log10(plot_df['p_value']))
plt.figure(figsize=(10, 8))
sns.barplot(
data=plot_df,
x='norm_pval',
y='name',
palette='viridis'
)
plt.title('Top Enriched Metabolic Processes in Mutant Group')
plt.xlabel('Inverse Log Adjusted P-value)')
plt.ylabel('Biological Process')
plt.tight_layout()
plt.savefig('preleuk_dashboard/plots/invlogp_metabolicpathways')
plt.show()
InĀ [18]:
#load aml public data and signatures and clean data
clinical_df = pd.read_csv('data/signatures/laml_tcga_clinical.tsv', sep='\t', comment='#', index_col = 'PATIENT_ID')
expr_df = pd.read_csv('data/signatures/laml_tcga_expression.tsv', sep='\t', index_col=0)
expr_df = expr_df.drop(columns=['Entrez_Gene_Id'])
plps_genes = pd.read_csv('data/signatures/plps.csv')['Gene'].tolist()
stem11_genes = pd.read_csv('data/signatures/stem11.csv')['Gene'].tolist()
clinical_df = clinical_df[~clinical_df.index.duplicated(keep='first')]
expr_df = expr_df.loc[:, ~expr_df.columns.duplicated(keep='first')]
clinical_df.index = clinical_df.index.str[:12]
expr_df.columns = expr_df.columns.str[:12]
expr_df.replace([np.inf, -np.inf], np.nan, inplace=True)
expr_df.fillna(0, inplace=True)
InĀ [19]:
#calculate signature scores
plps_net = pd.DataFrame({'source': 'PLPS', 'target': plps_genes, 'weight': 1})
stem11_net = pd.DataFrame({'source': 'Stem11', 'target': stem11_genes, 'weight': 1})
signature_net = pd.concat([plps_net, stem11_net], ignore_index = True)
InĀ [20]:
#run the wmean function and capture its output
patient_scores_df = dc.run_wmean(
mat = expr_df.T,
net = signature_net,
source = 'source',
target = 'target',
weight = 'weight',
min_n = 5
)
patient_scores_df = patient_scores_df[0]
InĀ [21]:
#prepare data for survival analysis
survival_df = clinical_df.join(patient_scores_df, how='inner')
time_col = 'OS_MONTHS'
event_col = 'OS_STATUS'
survival_df[time_col] = pd.to_numeric(survival_df[time_col], errors='coerce')
survival_df[event_col] = (survival_df[event_col] == '1:DECEASED').astype(int)
survival_df.dropna(subset=[time_col, event_col], inplace=True)
InĀ [22]:
#stratification analysis function
def run_stratification_analysis(signature_name, survival_data, time_column, event_column):
survival_data = survival_data[[time_column, event_column, signature_name]].dropna()
median_score = survival_data[signature_name].median()
survival_data[f'{signature_name}_group'] = ['High' if score > median_score else 'Low' for score in survival_data[signature_name]]
T = survival_data[time_column]
E = survival_data[event_column]
groups = survival_data[f'{signature_name}_group']
high_mask = (groups == 'High')
low_mask = (groups == 'Low')
results = logrank_test(T[high_mask], T[low_mask], event_observed_A=E[high_mask], event_observed_B=E[low_mask])
p_value = results.p_value
print(f"Log-rank test p-value: {p_value:.4f}")
kmf = KaplanMeierFitter()
fig, ax = plt.subplots(1, 1, figsize=(6, 5))
kmf.fit(T[high_mask], event_observed=E[high_mask], label=f'High {signature_name} Score')
kmf.plot_survival_function(ax=ax)
kmf.fit(T[low_mask], event_observed=E[low_mask], label=f'Low {signature_name} Score')
kmf.plot_survival_function(ax=ax)
ax.set_title(f'AML Patient Survival by {signature_name} Signature')
ax.set_xlabel('Overall Survival (Months)')
ax.set_ylabel('Survival Probability')
ax.text(0.1, 0.1, f'Log-rank p = {p_value:.4f}', transform=ax.transAxes)
plt.tight_layout()
plt.savefig(f'preleuk_dashboard/plots/{signature_name}_survival_plot.png', dpi=300)
plt.show()
run_stratification_analysis('PLPS', survival_df.copy(), time_col, event_col)
run_stratification_analysis('Stem11', survival_df.copy(), time_col, event_col)
Log-rank test p-value: 0.8430
Log-rank test p-value: 0.1106
InĀ [23]:
adata_to_export = adata
umap_coords = pd.DataFrame(adata_to_export.obsm['X_umap'], index=adata_to_export.obs_names)
umap_coords.columns = ['UMAP_1', 'UMAP_2']
cell_metadata = adata_to_export.obs[['celltype', 'condition']]
df_for_shiny = pd.concat([umap_coords, cell_metadata], axis=1)
df_for_shiny.to_csv('preleuk_dashboard/data/umap_data_for_shiny.csv')