Include prelim analysis script for Xe D202

97f23cc9 · John Lafin · d375e704 · 97f23cc9
Commit 97f23cc9 authored 5 months ago by John Lafin
--- a/scripts/xenium_analysis_D202.py
+++ b/scripts/xenium_analysis_D202.py
+#!/usr/bin/env python3
+# Run initial analysis of Xenium data
+# Author: John T Lafin
+# Updated: 2024-12-27
+# Import modules
+import os
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+import scanpy as sc
+import spatialdata as sd
+from spatialdata_io import xenium
+import sopa
+# Set parameters
+xenium_data='/archive/urology/Strand_lab/shared/xenium/output-XETG00248__0034053__D202__20240425__175357'
+zarr_dir='data_modified/spatialdata_zarr/'
+sample_name='D202'
+outdir='data_modified/xenium_'+sample_name+'/'
+os.makedirs(outdir, exist_ok=True)
+# Load data and create zarr store
+zarr_store=zarr_dir+sample_name+'.zarr'
+sdata = xenium(xenium_data, n_jobs=8)
+os.makedirs(zarr_dir, exist_ok=True)
+sdata.write(zarr_store)
+sdata = sd.read_zarr(zarr_store)
+# Pull the anndata object and calculate QC metrics
+adata = sdata['table']
+sc.pp.calculate_qc_metrics(adata, percent_top=(10, 20, 50, 150), inplace=True)
+# Save plots
+fig, axs = plt.subplots(1, 4, figsize=(15, 4))
+axs[0].set_title("Total transcripts per cell")
+sns.histplot(
+    adata.obs["total_counts"],
+    binwidth=5,
+    kde=False,
+    ax=axs[0],
+)
+axs[1].set_title("Unique transcripts per cell")
+sns.histplot(
+    adata.obs["n_genes_by_counts"],
+    binwidth=5,
+    kde=False,
+    ax=axs[1],
+)
+axs[2].set_title("Area of segmented cells")
+sns.histplot(
+    adata.obs["cell_area"],
+    binwidth=5,
+    kde=False,
+    ax=axs[2],
+)
+axs[3].set_title("Nucleus ratio")
+sns.histplot(
+    adata.obs["nucleus_area"] / adata.obs["cell_area"],
+    kde=False,
+    ax=axs[3],
+)
+plt.savefig(fname=outdir+'qc_plots.png')
+# Basic filtering
+sc.pp.filter_cells(adata, min_counts=10)
+sc.pp.filter_genes(adata, min_cells=5)
+# Save raw counts
+adata.layers['counts'] = adata.X.copy()
+# Pre-processing
+sc.pp.normalize_total(adata)
+sc.pp.log1p(adata)
+sc.pp.pca(adata)
+sc.pp.neighbors(adata)
+sc.tl.umap(adata)
+## Leiden clustering at 15 resolutions
+res = np.arange(0.1, 1.6, 0.1)
+res = np.round(res, 1)
+for i in res:
+    sc.tl.leiden(
+        adata,
+        resolution=i,
+        key_added="leiden_res_" + str(i),
+        flavor='igraph',
+        n_iterations=2
+    )
+# Create UMAP plots
+grps=[ 'leiden_res_'+str(x) for x in res ]
+sc.pl.umap(
+    adata,
+    color=grps,
+    ncols=3,
+    legend_loc='on data'
+)
+plt.savefig(fname=outdir+'umap_leiden.png')
+# Check gene expression
+genes=['EPCAM','KRT5','NKX3-1','PIGR','AKR1C2','DCN','ACTA2','DES','CDH19','PECAM1','PTPRC']
+sc.pl.umap(
+    adata,
+    color=genes,
+    ncols=3
+    )
+plt.savefig(fname=outdir+'umap_lineage_markers.png')
+# Save modified data
+adata.write_h5ad(outdir+sample_name+'_annotated.h5ad')
+sopa.io.write_cell_categories(outdir, adata)
+os.rename(src=outdir+'analysis.zarr.zip', dst=outdir+sample_name+'_annot_analysis.zarr.zip')