diff --git a/scripts/xenium_analysis_D216.py b/scripts/xenium_analysis_D216.py new file mode 100644 index 0000000000000000000000000000000000000000..18a199125e37b918ee29a165b0d726397b5127cd --- /dev/null +++ b/scripts/xenium_analysis_D216.py @@ -0,0 +1,120 @@ +#!/usr/bin/env python3 + +# Run initial analysis of Xenium data +# Author: John T Lafin +# Updated: 2024-11-26 + +# Import modules +import os +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt +import seaborn as sns +import scanpy as sc +import spatialdata as sd +from spatialdata_io import xenium +import sopa + +# Set parameters +xenium_data='/archive/urology/Strand_lab/shared/xenium/output-XETG00248__0034066__D216__20240425__175357/output-XETG00248__0034066__D216__20240425__175357' +zarr_dir='data_modified/spatialdata_zarr/' +sample_name='D216' +outdir='data_modified/xenium_D216/' +os.makedirs(outdir, exist_ok=True) + +# Load data and create zarr store +zarr_store=zarr_dir+sample_name+'.zarr' + +sdata = xenium(xenium_data, n_jobs=8) +os.makedirs(zarr_dir, exist_ok=True) +sdata.write(zarr_store) +sdata = sd.read_zarr(zarr_store) + +# Pull the anndata object and calculate QC metrics +adata = sdata['table'] +sc.pp.calculate_qc_metrics(adata, percent_top=(10, 20, 50, 150), inplace=True) + +# Save plots +fig, axs = plt.subplots(1, 4, figsize=(15, 4)) + +axs[0].set_title("Total transcripts per cell") +sns.histplot( + adata.obs["total_counts"], + binwidth=5, + kde=False, + ax=axs[0], +) + +axs[1].set_title("Unique transcripts per cell") +sns.histplot( + adata.obs["n_genes_by_counts"], + binwidth=5, + kde=False, + ax=axs[1], +) + + +axs[2].set_title("Area of segmented cells") +sns.histplot( + adata.obs["cell_area"], + binwidth=5, + kde=False, + ax=axs[2], +) + +axs[3].set_title("Nucleus ratio") +sns.histplot( + adata.obs["nucleus_area"] / adata.obs["cell_area"], + kde=False, + ax=axs[3], +) +plt.savefig(fname=outdir+'qc_plots.png') + +# Basic filtering +sc.pp.filter_cells(adata, min_counts=10) +sc.pp.filter_genes(adata, min_cells=5) + +# Save raw counts +adata.layers['counts'] = adata.X.copy() + +# Pre-processing +sc.pp.normalize_total(adata) +sc.pp.log1p(adata) +sc.pp.pca(adata) +sc.pp.neighbors(adata) +sc.tl.umap(adata) + +## Leiden clustering at 15 resolutions +res = np.arange(0.1, 1.6, 0.1) +res = np.round(res, 1) +for i in res: + sc.tl.leiden( + adata, + resolution=i, + key_added="leiden_res_" + str(i), + flavor='igraph', + n_iterations=2 + ) + +# Create UMAP plots +umap_dir=outdir+'umaps/' +os.makedirs(umap_dir, exist_ok=True) +for i in res: + sc.pl.umap( + adata, + color='leiden_res_'+str(i), + save=umap_dir+'leiden_res_'+str(i)+'.png' + ) + +# Check gene expression +genes=['EPCAM','KRT5','NKX3-1','PIGR','AKR1C2','DCN','ACTA2','DES','CDH19','PECAM1','PTPRC'] +sc.pl.umap( + adata, + color=genes, + save=umap_dir+'lineage_markers.png' + ) + +# Save modified data +adata.write_h5ad(outdir+sample_name+'_annotated.h5ad') +sopa.io.write_cell_categories(outdir, adata) +os.rename(src=outdir+'analysis.zarr.zip', dst=outdir+sample_name+'_annot_analysis.zarr.zip') diff --git a/scripts/xenium_analysis_D330.py b/scripts/xenium_analysis_D330.py new file mode 100644 index 0000000000000000000000000000000000000000..fb5f683b70243d65c75d0f12d139dbe3c2d50f47 --- /dev/null +++ b/scripts/xenium_analysis_D330.py @@ -0,0 +1,121 @@ +#!/usr/bin/env python3 + +# Run initial analysis of Xenium data +# Author: John T Lafin +# Updated: 2024-11-26 + +# Import modules +import os +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt +import seaborn as sns +import scanpy as sc +import spatialdata as sd +from spatialdata_io import xenium +import sopa + +# Set parameters +xenium_data='/archive/urology/Strand_lab/shared/xenium/output-XETG00248__0033934__D330-FLUT__20240718__174131' +zarr_dir='data_modified/spatialdata_zarr/' +sample_name='D330' +outdir='data_modified/xenium_'+sample_name+'/' +os.makedirs(outdir, exist_ok=True) + +# Load data and create zarr store +zarr_store=zarr_dir+sample_name+'.zarr' + +sdata = xenium(xenium_data, n_jobs=8) +os.makedirs(zarr_dir, exist_ok=True) +sdata.write(zarr_store) +sdata = sd.read_zarr(zarr_store) + +# Pull the anndata object and calculate QC metrics +adata = sdata['table'] +sc.pp.calculate_qc_metrics(adata, percent_top=(10, 20, 50, 150), inplace=True) + +# Save plots +fig, axs = plt.subplots(1, 4, figsize=(15, 4)) + +axs[0].set_title("Total transcripts per cell") +sns.histplot( + adata.obs["total_counts"], + binwidth=5, + kde=False, + ax=axs[0], +) + +axs[1].set_title("Unique transcripts per cell") +sns.histplot( + adata.obs["n_genes_by_counts"], + binwidth=5, + kde=False, + ax=axs[1], +) + + +axs[2].set_title("Area of segmented cells") +sns.histplot( + adata.obs["cell_area"], + binwidth=5, + kde=False, + ax=axs[2], +) + +axs[3].set_title("Nucleus ratio") +sns.histplot( + adata.obs["nucleus_area"] / adata.obs["cell_area"], + kde=False, + ax=axs[3], +) +plt.savefig(fname=outdir+'qc_plots.png') + +# Basic filtering +sc.pp.filter_cells(adata, min_counts=10) +sc.pp.filter_genes(adata, min_cells=5) + +# Save raw counts +adata.layers['counts'] = adata.X.copy() + +# Pre-processing +sc.pp.normalize_total(adata) +sc.pp.log1p(adata) +sc.pp.pca(adata) +sc.pp.neighbors(adata) +sc.tl.umap(adata) + +## Leiden clustering at 15 resolutions +res = np.arange(0.1, 1.6, 0.1) +res = np.round(res, 1) +for i in res: + sc.tl.leiden( + adata, + resolution=i, + key_added="leiden_res_" + str(i), + flavor='igraph', + n_iterations=2 + ) + +# Create UMAP plots +grps=[ 'leiden_res_'+str(x) for x in res ] +sc.pl.umap( + adata, + color=grps, + ncols=3, + legend_loc='on data' +) +plt.savefig(fname=outdir+'umap_leiden.png') + +# Check gene expression +genes=['EPCAM','KRT5','NKX3-1','PIGR','AKR1C2','DCN','ACTA2','DES','CDH19','PECAM1','PTPRC'] +sc.pl.umap( + adata, + color=genes, + ncols=3 + ) +plt.savefig(fname=outdir+'umap_lineage_markers.png') + +# Save modified data +adata.write_h5ad(outdir+sample_name+'_annotated.h5ad') +sopa.io.write_cell_categories(outdir, adata) +os.rename(src=outdir+'analysis.zarr.zip', dst=outdir+sample_name+'_annot_analysis.zarr.zip') diff --git a/scripts/xenium_analysis_D93.py b/scripts/xenium_analysis_D93.py new file mode 100644 index 0000000000000000000000000000000000000000..cdf8045fc6d1510bd5b462be6f9bd3e735ede7f4 --- /dev/null +++ b/scripts/xenium_analysis_D93.py @@ -0,0 +1,121 @@ +#!/usr/bin/env python3 + +# Run initial analysis of Xenium data +# Author: John T Lafin +# Updated: 2024-11-26 + +# Import modules +import os +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt +import seaborn as sns +import scanpy as sc +import spatialdata as sd +from spatialdata_io import xenium +import sopa + +# Set parameters +xenium_data='/archive/urology/Strand_lab/shared/xenium/output-XETG00248__0033827__D93-MULT__20240718__174131' +zarr_dir='data_modified/spatialdata_zarr/' +sample_name='D93' +outdir='data_modified/xenium_'+sample_name+'/' +os.makedirs(outdir, exist_ok=True) + +# Load data and create zarr store +zarr_store=zarr_dir+sample_name+'.zarr' + +sdata = xenium(xenium_data, n_jobs=8) +os.makedirs(zarr_dir, exist_ok=True) +sdata.write(zarr_store) +sdata = sd.read_zarr(zarr_store) + +# Pull the anndata object and calculate QC metrics +adata = sdata['table'] +sc.pp.calculate_qc_metrics(adata, percent_top=(10, 20, 50, 150), inplace=True) + +# Save plots +fig, axs = plt.subplots(1, 4, figsize=(15, 4)) + +axs[0].set_title("Total transcripts per cell") +sns.histplot( + adata.obs["total_counts"], + binwidth=5, + kde=False, + ax=axs[0], +) + +axs[1].set_title("Unique transcripts per cell") +sns.histplot( + adata.obs["n_genes_by_counts"], + binwidth=5, + kde=False, + ax=axs[1], +) + + +axs[2].set_title("Area of segmented cells") +sns.histplot( + adata.obs["cell_area"], + binwidth=5, + kde=False, + ax=axs[2], +) + +axs[3].set_title("Nucleus ratio") +sns.histplot( + adata.obs["nucleus_area"] / adata.obs["cell_area"], + kde=False, + ax=axs[3], +) +plt.savefig(fname=outdir+'qc_plots.png') + +# Basic filtering +sc.pp.filter_cells(adata, min_counts=10) +sc.pp.filter_genes(adata, min_cells=5) + +# Save raw counts +adata.layers['counts'] = adata.X.copy() + +# Pre-processing +sc.pp.normalize_total(adata) +sc.pp.log1p(adata) +sc.pp.pca(adata) +sc.pp.neighbors(adata) +sc.tl.umap(adata) + +## Leiden clustering at 15 resolutions +res = np.arange(0.1, 1.6, 0.1) +res = np.round(res, 1) +for i in res: + sc.tl.leiden( + adata, + resolution=i, + key_added="leiden_res_" + str(i), + flavor='igraph', + n_iterations=2 + ) + +# Create UMAP plots +grps=[ 'leiden_res_'+str(x) for x in res ] +sc.pl.umap( + adata, + color=grps, + ncols=3, + legend_loc='on data' +) +plt.savefig(fname=outdir+'umap_leiden.png') + +# Check gene expression +genes=['EPCAM','KRT5','NKX3-1','PIGR','AKR1C2','DCN','ACTA2','DES','CDH19','PECAM1','PTPRC'] +sc.pl.umap( + adata, + color=genes, + ncols=3 + ) +plt.savefig(fname=outdir+'umap_lineage_markers.png') + +# Save modified data +adata.write_h5ad(outdir+sample_name+'_annotated.h5ad') +sopa.io.write_cell_categories(outdir, adata) +os.rename(src=outdir+'analysis.zarr.zip', dst=outdir+sample_name+'_annot_analysis.zarr.zip')