1. Data preprocessing and curation

This notebook focuses on the preparation of LPS datasets, including both public and private sources, and demonstrates how to merge them for training the PerturbGen model.

1.1. Public LPS data

import gdown # for downloading files from Google Drive
import requests

Downloading the public LPS dataset

url = 'https://ftp.ebi.ac.uk/biostudies/fire/E-MTAB-/026/E-MTAB-10026/Files/covid_portal_210320_with_raw.h5ad'
output= './Raw_h5ad'
gdown.download(url,output) # download the file

Downloading...
From: https://ftp.ebi.ac.uk/biostudies/fire/E-MTAB-/026/E-MTAB-10026/Files/covid_portal_210320_with_raw.h5ad
To: /home/jovyan/farm_mount/Cytomeister/Evaluation_datasets/LPS/Public_Emily/Raw_h5ad.zip
100%|██████████| 7.19G/7.19G [01:31<00:00, 78.9MB/s]

'./Raw_h5ad.zip'

Import Libraries

import pandas as pd
import scanpy as sc
import numpy as np
import seaborn as sns

# Read the downloaded h5ad file
adata = sc.read('./Raw.h5ad')

Here, we investigate the .h5ad file to identify LPS samples and healthy control cells in the downloaded LPS data.

adata

AnnData object with n_obs × n_vars = 647366 × 24929
    obs: 'sample_id', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'full_clustering', 'initial_clustering', 'Resample', 'Collection_Day', 'Sex', 'Age_interval', 'Swab_result', 'Status', 'Smoker', 'Status_on_day_collection', 'Status_on_day_collection_summary', 'Days_from_onset', 'Site', 'time_after_LPS', 'Worst_Clinical_Status', 'Outcome', 'patient_id'
    var: 'feature_types'
    uns: 'hvg', 'leiden', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_pca_harmony', 'X_umap'
    layers: 'raw'

adata.obs['time_after_LPS'].value_counts() # check time points available

time_after_LPS
nan    639482
90m      3999
10h      3885
Name: count, dtype: int64

adata.obs['Status'].value_counts() # check conditions available

Status
Covid        527286
Healthy       97039
Non_covid     15157
LPS            7884
Name: count, dtype: int64

lps_sample_ids = adata.obs.loc[adata.obs['Status'] == 'LPS', 'patient_id'].unique() # get patient ids for LPS samples

healthy_sample_ids = adata.obs.loc[adata.obs['Status'] == 'Healthy', 'patient_id'].unique() # get patient ids for Healthy samples

adata = adata[adata.obs['Status'].isin(['LPS','Healthy'])].copy() # keep only LPS and Healthy samples

Next, we apply a set of modifications and add details to make downstream analyses more convenient e.g. converting time obs into categorical obs

adata.obs['time_after_LPS'] = adata.obs['time_after_LPS'].astype(str) # convert to string
adata.obs['time_after_LPS'] = adata.obs['time_after_LPS'].replace('nan', 'normal') # replace nan with normal

adata.obs['time_after_LPS'] = adata.obs['time_after_LPS'].apply(lambda x: x if x == 'normal' else f'{x}_LPS')

adata.obs['time_after_LPS'] = pd.Categorical(
    adata.obs['time_after_LPS'],
    categories=['normal', '90m_LPS', '10h_LPS'],
    ordered=True
)

adata.obs['time_after_LPS'].value_counts()

time_after_LPS
normal     97039
90m_LPS     3999
10h_LPS     3885
Name: count, dtype: int64

adata.obs['donor_id'] = adata.obs['patient_id']

adata.obs['study'] = 'Public_Emily2021'

adata.obs['Age_interval'].value_counts()

Age_interval
(30, 39]    30538
(50, 59]    26344
(20, 29]    20426
(40, 49]    15926
(60, 69]     9970
(70, 79]     1719
Name: count, dtype: int64

mapping = {
    '(20, 29]': 'Adult',      
    '(30, 39]': 'Adult',
    '(40, 49]': 'Adult',
    '(50, 59]': 'Adult',
    '(60, 69]': 'Adult',
    '(70, 79]': 'Old'
}

adata.obs['life_stage'] = adata.obs['Age_interval'].map(mapping)


adata.obs['life_stage'] = pd.Categorical(
    adata.obs['life_stage'],
    categories=['Embryo', 'Fetal', 'Childhood', 'Young Adult', 'Adult', 'Old'],
    ordered=True
)

adata.obs['development_stage'] = adata.obs['life_stage']

adata.obs['tissue'] = 'blood'

adata.X = adata.layers['raw'].copy()

adata.X.max()

np.float32(275279.0)

adata.obs['Resample']

covid_index
AAACCTGAGACCACGA-newcastle65    Initial
AAACCTGAGATGTCGG-newcastle65    Initial
AAACCTGAGGCGATAC-newcastle65    Initial
AAACCTGAGTACACCT-newcastle65    Initial
AAACCTGAGTGAATTG-newcastle65    Initial
                                 ...   
BGCV15_TTTCCTCTCTGATACG-1       Initial
BGCV15_TTTCCTCTCTTTAGGG-1       Initial
BGCV15_TTTGCGCTCACCGTAA-1       Initial
BGCV15_TTTGGTTTCAAGATCC-1       Initial
BGCV15_TTTGTCACAAGCCATT-1       Initial
Name: Resample, Length: 104923, dtype: category
Categories (1, object): ['Initial']

# Subset to Gene Expression features
adata = adata[:, adata.var['feature_types'] == 'Gene Expression']

ref = pd.read_csv("../../../../Ensembl_symbol_Human_(GRCh38.p14)/mart_export.txt")  

adata.var

	feature_types
MIR1302-2HG	Gene Expression
AL627309.1	Gene Expression
AL627309.3	Gene Expression
AL627309.2	Gene Expression
AL669831.2	Gene Expression
...	...
AC007325.2	Gene Expression
AL354822.1	Gene Expression
AC233755.2	Gene Expression
AC233755.1	Gene Expression
AC240274.1	Gene Expression

24737 rows × 1 columns

# 1. Convert to string and clean
ref.index = ref.index.astype(str).str.strip().str.upper()
ref["HGNC symbol"] = ref["HGNC symbol"].astype(str).str.strip().str.upper()

# Drop NaNs in gene_symbol (after converting to str 'nan', you can use replace)
ref = ref.replace({"HGNC symbol": {"NAN": None}}).dropna(subset=["HGNC symbol"])

# 2. Map gene_symbol -> ensembl_id by first occurrence
ref_rev = (
    ref.reset_index()
       .drop_duplicates(subset=["HGNC symbol"])
       .set_index("HGNC symbol")["Gene stable ID"]
)

# 3. Map gene_symbol from adata.var_names to ensembl_id
adata.var["gene_symbol"] = adata.var_names.str.strip().str.upper()
adata.var["ensembl_id"] = adata.var["gene_symbol"].map(ref_rev)

# 4. Filter genes without mapping
adata = adata[:, adata.var["ensembl_id"].notna()].copy()

# 5. Set var_names to ensembl_id
adata.var_names = adata.var["ensembl_id"]

print(f"adata now has {adata.n_vars} genes with Ensembl IDs as var_names.")

/tmp/ipykernel_962786/4272093940.py:16: ImplicitModificationWarning: Trying to modify attribute `.var` of view, initializing view as actual.
  adata.var["gene_symbol"] = adata.var_names.str.strip().str.upper()

adata now has 19193 genes with Ensembl IDs as var_names.

adata.var.index.name = 'gene_id'

adata.var

	feature_types	gene_symbol	ensembl_id
gene_id
ENSG00000243485	Gene Expression	MIR1302-2HG	ENSG00000243485
ENSG00000177757	Gene Expression	FAM87B	ENSG00000177757
ENSG00000225880	Gene Expression	LINC00115	ENSG00000225880
ENSG00000230368	Gene Expression	FAM41C	ENSG00000230368
ENSG00000187634	Gene Expression	SAMD11	ENSG00000187634
...	...	...	...
ENSG00000198886	Gene Expression	MT-ND4	ENSG00000198886
ENSG00000198786	Gene Expression	MT-ND5	ENSG00000198786
ENSG00000198695	Gene Expression	MT-ND6	ENSG00000198695
ENSG00000198727	Gene Expression	MT-CYB	ENSG00000198727
ENSG00000274847	Gene Expression	MAFIP	ENSG00000274847

19193 rows × 3 columns

adata.obs['development_stage']

covid_index
AAACCTGAGACCACGA-newcastle65    Adult
AAACCTGAGATGTCGG-newcastle65    Adult
AAACCTGAGGCGATAC-newcastle65    Adult
AAACCTGAGTACACCT-newcastle65    Adult
AAACCTGAGTGAATTG-newcastle65    Adult
                                ...  
BGCV15_TTTCCTCTCTGATACG-1       Adult
BGCV15_TTTCCTCTCTTTAGGG-1       Adult
BGCV15_TTTGCGCTCACCGTAA-1       Adult
BGCV15_TTTGGTTTCAAGATCC-1       Adult
BGCV15_TTTGTCACAAGCCATT-1       Adult
Name: development_stage, Length: 104923, dtype: category
Categories (2, object): ['Adult' < 'Old']

adata.obs['full_clustering'].value_counts()

full_clustering
CD4.Naive                13829
NK_16hi                  13473
CD8.Naive                 8105
CD4.CM                    7505
B_naive                   6908
CD83_CD14_mono            6772
CD8.TE                    6501
CD4.IL22                  6379
CD8.EM                    5734
gdT                       4948
CD14_mono                 3774
MAIT                      3465
CD16_mono                 3007
Platelets                 2363
NK_56hi                   2190
B_switched_memory         1395
DC2                       1052
DC3                        975
B_immature                 966
NKT                        864
CD4.Tfh                    800
pDC                        678
B_non-switched_memory      647
B_exhausted                334
RBC                        301
NK_prolif                  221
ILC1_3                     213
C1_CD16_mono               198
CD4.EM                     188
Plasma_cell_IgA            173
Plasma_cell_IgG            168
DC1                        104
CD4.Th1                    101
Plasmablast                 95
Plasma_cell_IgM             84
Treg                        75
HSC_erythroid               74
CD8.Prolif                  70
HSC_CD38pos                 61
ILC2                        36
CD4.Prolif                  32
CD4.Th2                     16
ASDC                        16
HSC_CD38neg                 13
DC_prolif                    6
HSC_prolif                   6
Mono_prolif                  4
HSC_myeloid                  3
CD4.Th17                     1
Name: count, dtype: int64

adata.obs['cell_type'] = adata.obs['full_clustering'].copy()

cell_type_counts = adata.obs['cell_type'].value_counts()

valid_cell_types = cell_type_counts[cell_type_counts >= 50].index

adata = adata[adata.obs['cell_type'].isin(valid_cell_types)].copy()

adata.obs['time_after_LPS']

covid_index
AAACCTGAGACCACGA-newcastle65    normal
AAACCTGAGATGTCGG-newcastle65    normal
AAACCTGAGGCGATAC-newcastle65    normal
AAACCTGAGTACACCT-newcastle65    normal
AAACCTGAGTGAATTG-newcastle65    normal
                                 ...  
BGCV15_TTTCCTCTCTGATACG-1       normal
BGCV15_TTTCCTCTCTTTAGGG-1       normal
BGCV15_TTTGCGCTCACCGTAA-1       normal
BGCV15_TTTGGTTTCAAGATCC-1       normal
BGCV15_TTTGTCACAAGCCATT-1       normal
Name: time_after_LPS, Length: 104790, dtype: category
Categories (3, object): ['normal' < '90m_LPS' < '10h_LPS']

# If data is sparse, it's best to use .X.sum(axis=1).A1
if hasattr(adata.X, 'sum'):
    adata.obs['n_counts'] = np.ravel(adata.X.sum(axis=1))
else:
    adata.obs['n_counts'] = adata.X.sum(axis=1)

adata.obs['n_counts']

covid_index
AAACCTGAGACCACGA-newcastle65    4242.0
AAACCTGAGATGTCGG-newcastle65    4708.0
AAACCTGAGGCGATAC-newcastle65    2858.0
AAACCTGAGTACACCT-newcastle65    5227.0
AAACCTGAGTGAATTG-newcastle65    4012.0
                                 ...  
BGCV15_TTTCCTCTCTGATACG-1       5766.0
BGCV15_TTTCCTCTCTTTAGGG-1       7415.0
BGCV15_TTTGCGCTCACCGTAA-1       5168.0
BGCV15_TTTGGTTTCAAGATCC-1       6504.0
BGCV15_TTTGTCACAAGCCATT-1       5246.0
Name: n_counts, Length: 104790, dtype: float32

Eventually, we save the public data

adata.write_h5ad('./Public_LPS_PP.h5ad')

1.2. Private LPS data

This data will be published after the paper publication.

tcell=sc.read_h5ad('/path/to/data1/tcell_raw_matrix_annotated.h5ad')

tcell

AnnData object with n_obs × n_vars = 69370 × 35976
    obs: 'patient_id', 'time_after_LPS', 'Sex', 'age', 'batch', 'IR_VJ_1_locus_tcr', 'IR_VJ_2_locus_tcr', 'IR_VDJ_1_locus_tcr', 'IR_VDJ_2_locus_tcr', 'IR_VJ_1_cdr3_tcr', 'IR_VJ_2_cdr3_tcr', 'IR_VDJ_1_cdr3_tcr', 'IR_VDJ_2_cdr3_tcr', 'IR_VJ_1_cdr3_nt_tcr', 'IR_VJ_2_cdr3_nt_tcr', 'IR_VDJ_1_cdr3_nt_tcr', 'IR_VDJ_2_cdr3_nt_tcr', 'IR_VJ_1_expr_tcr', 'IR_VJ_2_expr_tcr', 'IR_VDJ_1_expr_tcr', 'IR_VDJ_2_expr_tcr', 'IR_VJ_1_expr_raw_tcr', 'IR_VJ_2_expr_raw_tcr', 'IR_VDJ_1_expr_raw_tcr', 'IR_VDJ_2_expr_raw_tcr', 'IR_VJ_1_v_gene_tcr', 'IR_VJ_2_v_gene_tcr', 'IR_VDJ_1_v_gene_tcr', 'IR_VDJ_2_v_gene_tcr', 'IR_VJ_1_d_gene_tcr', 'IR_VJ_2_d_gene_tcr', 'IR_VDJ_1_d_gene_tcr', 'IR_VDJ_2_d_gene_tcr', 'IR_VJ_1_j_gene_tcr', 'IR_VJ_2_j_gene_tcr', 'IR_VDJ_1_j_gene_tcr', 'IR_VDJ_2_j_gene_tcr', 'IR_VJ_1_c_gene_tcr', 'IR_VJ_2_c_gene_tcr', 'IR_VDJ_1_c_gene_tcr', 'IR_VDJ_2_c_gene_tcr', 'IR_VJ_1_junction_ins_tcr', 'IR_VJ_2_junction_ins_tcr', 'IR_VDJ_1_junction_ins_tcr', 'IR_VDJ_2_junction_ins_tcr', 'has_ir_tcr', 'multi_chain_tcr', 'IR_VJ_1_locus_bcr', 'IR_VJ_2_locus_bcr', 'IR_VDJ_1_locus_bcr', 'IR_VDJ_2_locus_bcr', 'IR_VJ_1_cdr3_bcr', 'IR_VJ_2_cdr3_bcr', 'IR_VDJ_1_cdr3_bcr', 'IR_VDJ_2_cdr3_bcr', 'IR_VJ_1_cdr3_nt_bcr', 'IR_VJ_2_cdr3_nt_bcr', 'IR_VDJ_1_cdr3_nt_bcr', 'IR_VDJ_2_cdr3_nt_bcr', 'IR_VJ_1_expr_bcr', 'IR_VJ_2_expr_bcr', 'IR_VDJ_1_expr_bcr', 'IR_VDJ_2_expr_bcr', 'IR_VJ_1_expr_raw_bcr', 'IR_VJ_2_expr_raw_bcr', 'IR_VDJ_1_expr_raw_bcr', 'IR_VDJ_2_expr_raw_bcr', 'IR_VJ_1_v_gene_bcr', 'IR_VJ_2_v_gene_bcr', 'IR_VDJ_1_v_gene_bcr', 'IR_VDJ_2_v_gene_bcr', 'IR_VJ_1_d_gene_bcr', 'IR_VJ_2_d_gene_bcr', 'IR_VDJ_1_d_gene_bcr', 'IR_VDJ_2_d_gene_bcr', 'IR_VJ_1_j_gene_bcr', 'IR_VJ_2_j_gene_bcr', 'IR_VDJ_1_j_gene_bcr', 'IR_VDJ_2_j_gene_bcr', 'IR_VJ_1_c_gene_bcr', 'IR_VJ_2_c_gene_bcr', 'IR_VDJ_1_c_gene_bcr', 'IR_VDJ_2_c_gene_bcr', 'IR_VJ_1_junction_ins_bcr', 'IR_VJ_2_junction_ins_bcr', 'IR_VDJ_1_junction_ins_bcr', 'IR_VDJ_2_junction_ins_bcr', 'has_ir_bcr', 'multi_chain_bcr', 'scr_th', 'doublet_scores', 'cal_th', 'scr_predicted_doublets', 'cal_dobulets', 'data_batch', 'individual_id', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'cell_typist_full_predicted_labels', 'cell_typist_full_majority_voting', 'cell_typist_initial_predicted_labels', 'cell_typist_initial_majority_voting', 'final_anno', 'timeline', 'sample_id', 'all_T'
    var: 'gene_ids', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'GEX'

bcell=sc.read_h5ad('/path/to/data2/bcell_raw_matrix_annotated_allanno.h5ad')

bcell

AnnData object with n_obs × n_vars = 11964 × 36601
    obs: 'patient_id', 'time_after_LPS', 'Sex', 'age', 'batch', 'IR_VJ_1_locus_tcr', 'IR_VJ_2_locus_tcr', 'IR_VDJ_1_locus_tcr', 'IR_VDJ_2_locus_tcr', 'IR_VJ_1_cdr3_tcr', 'IR_VJ_2_cdr3_tcr', 'IR_VDJ_1_cdr3_tcr', 'IR_VDJ_2_cdr3_tcr', 'IR_VJ_1_cdr3_nt_tcr', 'IR_VJ_2_cdr3_nt_tcr', 'IR_VDJ_1_cdr3_nt_tcr', 'IR_VDJ_2_cdr3_nt_tcr', 'IR_VJ_1_expr_tcr', 'IR_VJ_2_expr_tcr', 'IR_VDJ_1_expr_tcr', 'IR_VDJ_2_expr_tcr', 'IR_VJ_1_expr_raw_tcr', 'IR_VJ_2_expr_raw_tcr', 'IR_VDJ_1_expr_raw_tcr', 'IR_VDJ_2_expr_raw_tcr', 'IR_VJ_1_v_gene_tcr', 'IR_VJ_2_v_gene_tcr', 'IR_VDJ_1_v_gene_tcr', 'IR_VDJ_2_v_gene_tcr', 'IR_VJ_1_d_gene_tcr', 'IR_VJ_2_d_gene_tcr', 'IR_VDJ_1_d_gene_tcr', 'IR_VDJ_2_d_gene_tcr', 'IR_VJ_1_j_gene_tcr', 'IR_VJ_2_j_gene_tcr', 'IR_VDJ_1_j_gene_tcr', 'IR_VDJ_2_j_gene_tcr', 'IR_VJ_1_c_gene_tcr', 'IR_VJ_2_c_gene_tcr', 'IR_VDJ_1_c_gene_tcr', 'IR_VDJ_2_c_gene_tcr', 'IR_VJ_1_junction_ins_tcr', 'IR_VJ_2_junction_ins_tcr', 'IR_VDJ_1_junction_ins_tcr', 'IR_VDJ_2_junction_ins_tcr', 'has_ir_tcr', 'multi_chain_tcr', 'IR_VJ_1_locus_bcr', 'IR_VJ_2_locus_bcr', 'IR_VDJ_1_locus_bcr', 'IR_VDJ_2_locus_bcr', 'IR_VJ_1_cdr3_bcr', 'IR_VJ_2_cdr3_bcr', 'IR_VDJ_1_cdr3_bcr', 'IR_VDJ_2_cdr3_bcr', 'IR_VJ_1_cdr3_nt_bcr', 'IR_VJ_2_cdr3_nt_bcr', 'IR_VDJ_1_cdr3_nt_bcr', 'IR_VDJ_2_cdr3_nt_bcr', 'IR_VJ_1_expr_bcr', 'IR_VJ_2_expr_bcr', 'IR_VDJ_1_expr_bcr', 'IR_VDJ_2_expr_bcr', 'IR_VJ_1_expr_raw_bcr', 'IR_VJ_2_expr_raw_bcr', 'IR_VDJ_1_expr_raw_bcr', 'IR_VDJ_2_expr_raw_bcr', 'IR_VJ_1_v_gene_bcr', 'IR_VJ_2_v_gene_bcr', 'IR_VDJ_1_v_gene_bcr', 'IR_VDJ_2_v_gene_bcr', 'IR_VJ_1_d_gene_bcr', 'IR_VJ_2_d_gene_bcr', 'IR_VDJ_1_d_gene_bcr', 'IR_VDJ_2_d_gene_bcr', 'IR_VJ_1_j_gene_bcr', 'IR_VJ_2_j_gene_bcr', 'IR_VDJ_1_j_gene_bcr', 'IR_VDJ_2_j_gene_bcr', 'IR_VJ_1_c_gene_bcr', 'IR_VJ_2_c_gene_bcr', 'IR_VDJ_1_c_gene_bcr', 'IR_VDJ_2_c_gene_bcr', 'IR_VJ_1_junction_ins_bcr', 'IR_VJ_2_junction_ins_bcr', 'IR_VDJ_1_junction_ins_bcr', 'IR_VDJ_2_junction_ins_bcr', 'has_ir_bcr', 'multi_chain_bcr', 'scr_th', 'doublet_scores', 'cal_th', 'scr_predicted_doublets', 'cal_dobulets', 'data_batch', 'individual_id', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'cell_typist_full_predicted_labels', 'cell_typist_full_majority_voting', 'cell_typist_initial_predicted_labels', 'cell_typist_initial_majority_voting', 'final_anno', 'final_anno1', 'timeline', 'sample_id', 'all_B'
    var: 'gene_ids', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts'

myeloid=sc.read_h5ad('/path/to/data3/mcell_raw_matrix_annotated.h5ad')

combined_data=tcell.concatenate(bcell,myeloid) # combine the three datasets

/tmp/ipykernel_968620/3716584077.py:1: FutureWarning: Use anndata.concat instead of AnnData.concatenate, AnnData.concatenate is deprecated and will be removed in the future. See the tutorial for concat at: https://anndata.readthedocs.io/en/latest/concatenation.html
  combined_data=tcell.concatenate(bcell,myeloid)

combined_data # check combined data

AnnData object with n_obs × n_vars = 122263 × 35976
    obs: 'patient_id', 'time_after_LPS', 'Sex', 'age', 'batch', 'IR_VJ_1_locus_tcr', 'IR_VJ_2_locus_tcr', 'IR_VDJ_1_locus_tcr', 'IR_VDJ_2_locus_tcr', 'IR_VJ_1_cdr3_tcr', 'IR_VJ_2_cdr3_tcr', 'IR_VDJ_1_cdr3_tcr', 'IR_VDJ_2_cdr3_tcr', 'IR_VJ_1_cdr3_nt_tcr', 'IR_VJ_2_cdr3_nt_tcr', 'IR_VDJ_1_cdr3_nt_tcr', 'IR_VDJ_2_cdr3_nt_tcr', 'IR_VJ_1_expr_tcr', 'IR_VJ_2_expr_tcr', 'IR_VDJ_1_expr_tcr', 'IR_VDJ_2_expr_tcr', 'IR_VJ_1_expr_raw_tcr', 'IR_VJ_2_expr_raw_tcr', 'IR_VDJ_1_expr_raw_tcr', 'IR_VDJ_2_expr_raw_tcr', 'IR_VJ_1_v_gene_tcr', 'IR_VJ_2_v_gene_tcr', 'IR_VDJ_1_v_gene_tcr', 'IR_VDJ_2_v_gene_tcr', 'IR_VJ_1_d_gene_tcr', 'IR_VJ_2_d_gene_tcr', 'IR_VDJ_1_d_gene_tcr', 'IR_VDJ_2_d_gene_tcr', 'IR_VJ_1_j_gene_tcr', 'IR_VJ_2_j_gene_tcr', 'IR_VDJ_1_j_gene_tcr', 'IR_VDJ_2_j_gene_tcr', 'IR_VJ_1_c_gene_tcr', 'IR_VJ_2_c_gene_tcr', 'IR_VDJ_1_c_gene_tcr', 'IR_VDJ_2_c_gene_tcr', 'IR_VJ_1_junction_ins_tcr', 'IR_VJ_2_junction_ins_tcr', 'IR_VDJ_1_junction_ins_tcr', 'IR_VDJ_2_junction_ins_tcr', 'has_ir_tcr', 'multi_chain_tcr', 'IR_VJ_1_locus_bcr', 'IR_VJ_2_locus_bcr', 'IR_VDJ_1_locus_bcr', 'IR_VDJ_2_locus_bcr', 'IR_VJ_1_cdr3_bcr', 'IR_VJ_2_cdr3_bcr', 'IR_VDJ_1_cdr3_bcr', 'IR_VDJ_2_cdr3_bcr', 'IR_VJ_1_cdr3_nt_bcr', 'IR_VJ_2_cdr3_nt_bcr', 'IR_VDJ_1_cdr3_nt_bcr', 'IR_VDJ_2_cdr3_nt_bcr', 'IR_VJ_1_expr_bcr', 'IR_VJ_2_expr_bcr', 'IR_VDJ_1_expr_bcr', 'IR_VDJ_2_expr_bcr', 'IR_VJ_1_expr_raw_bcr', 'IR_VJ_2_expr_raw_bcr', 'IR_VDJ_1_expr_raw_bcr', 'IR_VDJ_2_expr_raw_bcr', 'IR_VJ_1_v_gene_bcr', 'IR_VJ_2_v_gene_bcr', 'IR_VDJ_1_v_gene_bcr', 'IR_VDJ_2_v_gene_bcr', 'IR_VJ_1_d_gene_bcr', 'IR_VJ_2_d_gene_bcr', 'IR_VDJ_1_d_gene_bcr', 'IR_VDJ_2_d_gene_bcr', 'IR_VJ_1_j_gene_bcr', 'IR_VJ_2_j_gene_bcr', 'IR_VDJ_1_j_gene_bcr', 'IR_VDJ_2_j_gene_bcr', 'IR_VJ_1_c_gene_bcr', 'IR_VJ_2_c_gene_bcr', 'IR_VDJ_1_c_gene_bcr', 'IR_VDJ_2_c_gene_bcr', 'IR_VJ_1_junction_ins_bcr', 'IR_VJ_2_junction_ins_bcr', 'IR_VDJ_1_junction_ins_bcr', 'IR_VDJ_2_junction_ins_bcr', 'has_ir_bcr', 'multi_chain_bcr', 'scr_th', 'doublet_scores', 'cal_th', 'scr_predicted_doublets', 'cal_dobulets', 'data_batch', 'individual_id', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'cell_typist_full_predicted_labels', 'cell_typist_full_majority_voting', 'cell_typist_initial_predicted_labels', 'cell_typist_initial_majority_voting', 'final_anno', 'timeline', 'sample_id', 'all_T', 'final_anno1', 'all_B'
    var: 'gene_ids', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'GEX-0', 'GEX-2'

Next, we apply a set of modifications, QC analyses and preprocessing as data is raw

# mitochondrial genes
combined_data.var["mt"] = combined_data.var_names.str.startswith("MT-")
# ribosomal genes
combined_data.var["ribo"] = combined_data.var_names.str.startswith(("RPS", "RPL"))
# hemoglobin genes.
combined_data.var["hb"] = combined_data.var_names.str.contains(("^HB[^(P)]"))

sc.pp.calculate_qc_metrics( # calculate QC metrics
    combined_data, qc_vars=["mt", "ribo", "hb"], inplace=True, percent_top=[20], log1p=True
)
combined_data

AnnData object with n_obs × n_vars = 122263 × 35976
    obs: 'patient_id', 'time_after_LPS', 'Sex', 'age', 'batch', 'IR_VJ_1_locus_tcr', 'IR_VJ_2_locus_tcr', 'IR_VDJ_1_locus_tcr', 'IR_VDJ_2_locus_tcr', 'IR_VJ_1_cdr3_tcr', 'IR_VJ_2_cdr3_tcr', 'IR_VDJ_1_cdr3_tcr', 'IR_VDJ_2_cdr3_tcr', 'IR_VJ_1_cdr3_nt_tcr', 'IR_VJ_2_cdr3_nt_tcr', 'IR_VDJ_1_cdr3_nt_tcr', 'IR_VDJ_2_cdr3_nt_tcr', 'IR_VJ_1_expr_tcr', 'IR_VJ_2_expr_tcr', 'IR_VDJ_1_expr_tcr', 'IR_VDJ_2_expr_tcr', 'IR_VJ_1_expr_raw_tcr', 'IR_VJ_2_expr_raw_tcr', 'IR_VDJ_1_expr_raw_tcr', 'IR_VDJ_2_expr_raw_tcr', 'IR_VJ_1_v_gene_tcr', 'IR_VJ_2_v_gene_tcr', 'IR_VDJ_1_v_gene_tcr', 'IR_VDJ_2_v_gene_tcr', 'IR_VJ_1_d_gene_tcr', 'IR_VJ_2_d_gene_tcr', 'IR_VDJ_1_d_gene_tcr', 'IR_VDJ_2_d_gene_tcr', 'IR_VJ_1_j_gene_tcr', 'IR_VJ_2_j_gene_tcr', 'IR_VDJ_1_j_gene_tcr', 'IR_VDJ_2_j_gene_tcr', 'IR_VJ_1_c_gene_tcr', 'IR_VJ_2_c_gene_tcr', 'IR_VDJ_1_c_gene_tcr', 'IR_VDJ_2_c_gene_tcr', 'IR_VJ_1_junction_ins_tcr', 'IR_VJ_2_junction_ins_tcr', 'IR_VDJ_1_junction_ins_tcr', 'IR_VDJ_2_junction_ins_tcr', 'has_ir_tcr', 'multi_chain_tcr', 'IR_VJ_1_locus_bcr', 'IR_VJ_2_locus_bcr', 'IR_VDJ_1_locus_bcr', 'IR_VDJ_2_locus_bcr', 'IR_VJ_1_cdr3_bcr', 'IR_VJ_2_cdr3_bcr', 'IR_VDJ_1_cdr3_bcr', 'IR_VDJ_2_cdr3_bcr', 'IR_VJ_1_cdr3_nt_bcr', 'IR_VJ_2_cdr3_nt_bcr', 'IR_VDJ_1_cdr3_nt_bcr', 'IR_VDJ_2_cdr3_nt_bcr', 'IR_VJ_1_expr_bcr', 'IR_VJ_2_expr_bcr', 'IR_VDJ_1_expr_bcr', 'IR_VDJ_2_expr_bcr', 'IR_VJ_1_expr_raw_bcr', 'IR_VJ_2_expr_raw_bcr', 'IR_VDJ_1_expr_raw_bcr', 'IR_VDJ_2_expr_raw_bcr', 'IR_VJ_1_v_gene_bcr', 'IR_VJ_2_v_gene_bcr', 'IR_VDJ_1_v_gene_bcr', 'IR_VDJ_2_v_gene_bcr', 'IR_VJ_1_d_gene_bcr', 'IR_VJ_2_d_gene_bcr', 'IR_VDJ_1_d_gene_bcr', 'IR_VDJ_2_d_gene_bcr', 'IR_VJ_1_j_gene_bcr', 'IR_VJ_2_j_gene_bcr', 'IR_VDJ_1_j_gene_bcr', 'IR_VDJ_2_j_gene_bcr', 'IR_VJ_1_c_gene_bcr', 'IR_VJ_2_c_gene_bcr', 'IR_VDJ_1_c_gene_bcr', 'IR_VDJ_2_c_gene_bcr', 'IR_VJ_1_junction_ins_bcr', 'IR_VJ_2_junction_ins_bcr', 'IR_VDJ_1_junction_ins_bcr', 'IR_VDJ_2_junction_ins_bcr', 'has_ir_bcr', 'multi_chain_bcr', 'scr_th', 'doublet_scores', 'cal_th', 'scr_predicted_doublets', 'cal_dobulets', 'data_batch', 'individual_id', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'cell_typist_full_predicted_labels', 'cell_typist_full_majority_voting', 'cell_typist_initial_predicted_labels', 'cell_typist_initial_majority_voting', 'final_anno', 'timeline', 'sample_id', 'all_T', 'final_anno1', 'all_B', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'log1p_total_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb'
    var: 'gene_ids', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'GEX-0', 'GEX-2', 'ribo', 'hb', 'log1p_mean_counts', 'log1p_total_counts'

Visualize QC metrics

p1 = sns.displot(combined_data.obs["total_counts"], bins=100, kde=False)
sc.pl.violin(combined_data, 'total_counts')
p2 = sc.pl.violin(combined_data, "pct_counts_mt")
p3 = sc.pl.scatter(combined_data, "total_counts", "n_genes_by_counts", color="pct_counts_mt")

../_images/508d701d2564581649c13f65416bc789e66263e61d7b82f81b31e710c7f0fb10.png

../_images/1d9f740c4df01412f623f868b103544cf394450894497dd079e1fb17674a5d26.png

../_images/40eebe636c8ab33ec882db282f8c63b91325bc02fc53bb1ee45b8ec06f474f57.png

../_images/9e3fe082184582e5b82e9b20a52ea4309887abe75cdeff00fd26d90ae414a971.png

combined_data = combined_data[combined_data.obs['pct_counts_mt'] <= 8] # filter cells with high mitochondrial content

sc.pp.filter_cells(combined_data, max_counts=150000) # filter cells with too many counts

/software/cellgen/team361/am74/envs/pertpy/lib/python3.9/site-packages/scanpy/preprocessing/_simple.py:165: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual.
  adata.obs["n_counts"] = number

combined_data.obs['n_genes'] = (combined_data.X > 0).sum(1) # calculate number of genes per cell
sns.distplot(combined_data.obs['n_genes'], kde=False, bins=60) # visualize number of genes per cell

/tmp/ipykernel_968620/3987137127.py:2: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(combined_data.obs['n_genes'], kde=False, bins=60)

<Axes: xlabel='n_genes'>

../_images/30cca78160a12a0270c4d072b4f4ce81c37bc7a5d92981ffcb4646634f02c938.png

sc.pp.filter_genes(combined_data, min_cells=20) # filter genes expressed in less than 20 cells
print('Number of genes after cell filter: {:d}'.format(combined_data.n_vars)) 

Number of genes after cell filter: 19186

combined_data.obs['batch'] = combined_data.obs['batch'].cat.rename_categories({
    '0': 'T cell lineage',
    '1': 'B cell lineage',
    '2': 'Myeloid lineage'
})

combined_data.obs['final_anno'].value_counts()

final_anno
CD14mono                     23888
CD4_naive                    18268
CD8_naive                    13249
CD4_memory                   10259
CD8_EM                        8869
B_naive                       8185
Int.mono                      7229
NK_CD56dim                    4922
CD14mono_M1                   4129
gdT                           3325
MAIT                          2577
Treg                          2382
NKT                           2376
non-swtiched memory           1583
CD16mono                      1213
switched memory               1158
CD8_CM                        1004
DC2                            972
Platelet                       862
NK_CD56bright                  856
pDC                            696
DC3                            488
non-switched memory/CD11c      301
HSPC                           227
plasma_IgA                     160
DC1                             60
plasma_dividing                 50
plasma_IgG                      31
ASDC                            23
plasma_IgM                      18
Name: count, dtype: int64

combined_data.X.max() # check max counts after filtering

2717.0

combined_data.obs['cell_type'] = combined_data.obs['final_anno'] # set cell_type based on final_anno

combined_data.obs['patient_id']

AAACCTGAGACAAAGG-1-0    IVLPS01_Baseline
AAACCTGAGCGTTCCG-1-0    IVLPS01_Baseline
AAACCTGAGTGTTTGC-1-0    IVLPS01_Baseline
AAACCTGCAAACTGTC-1-0    IVLPS01_Baseline
AAACCTGCACAACGTT-1-0    IVLPS01_Baseline
                              ...       
TTTGGTTTCGGAATCT-1-2         IVLPS12_10h
TTTGTCAAGGCATGTG-1-2         IVLPS12_10h
TTTGTCACAAAGGAAG-1-2         IVLPS12_10h
TTTGTCATCAACACTG-1-2         IVLPS12_10h
TTTGTCATCACAACGT-1-2         IVLPS12_10h
Name: patient_id, Length: 119360, dtype: category
Categories (23, object): ['IVLPS01_10h', 'IVLPS01_6h', 'IVLPS01_90min', 'IVLPS01_Baseline', ..., 'IVLPS06_Baseline', 'IVLPS12_10h', 'IVLPS12_6h', 'IVLPS12_90min']

combined_data.obs['donor_id'] = combined_data.obs['patient_id'].copy()

combined_data.obs['time_after_LPS']

AAACCTGAGACAAAGG-1-0     0m
AAACCTGAGCGTTCCG-1-0     0m
AAACCTGAGTGTTTGC-1-0     0m
AAACCTGCAAACTGTC-1-0     0m
AAACCTGCACAACGTT-1-0     0m
                       ... 
TTTGGTTTCGGAATCT-1-2    10h
TTTGTCAAGGCATGTG-1-2    10h
TTTGTCACAAAGGAAG-1-2    10h
TTTGTCATCAACACTG-1-2    10h
TTTGTCATCACAACGT-1-2    10h
Name: time_after_LPS, Length: 119360, dtype: category
Categories (4, object): ['0m', '10h', '6h', '90m']

combined_data.obs['time_after_LPS'] = combined_data.obs['time_after_LPS'].cat.rename_categories({
    '0m': 'normal',
    '10h': '10h_LPS',
    '6h': '6h_LPS',
    '90m': '90m_LPS'
})

combined_data.obs['time_after_LPS']

AAACCTGAGACAAAGG-1-0     normal
AAACCTGAGCGTTCCG-1-0     normal
AAACCTGAGTGTTTGC-1-0     normal
AAACCTGCAAACTGTC-1-0     normal
AAACCTGCACAACGTT-1-0     normal
                         ...   
TTTGGTTTCGGAATCT-1-2    10h_LPS
TTTGTCAAGGCATGTG-1-2    10h_LPS
TTTGTCACAAAGGAAG-1-2    10h_LPS
TTTGTCATCAACACTG-1-2    10h_LPS
TTTGTCATCACAACGT-1-2    10h_LPS
Name: time_after_LPS, Length: 119360, dtype: category
Categories (4, object): ['normal', '10h_LPS', '6h_LPS', '90m_LPS']

combined_data.obs['study'] = 'Private_Data_Emily'

combined_data.obs['tissue'] = 'blood'

combined_data

AnnData object with n_obs × n_vars = 119360 × 19186
    obs: 'patient_id', 'time_after_LPS', 'Sex', 'age', 'batch', 'IR_VJ_1_locus_tcr', 'IR_VJ_2_locus_tcr', 'IR_VDJ_1_locus_tcr', 'IR_VDJ_2_locus_tcr', 'IR_VJ_1_cdr3_tcr', 'IR_VJ_2_cdr3_tcr', 'IR_VDJ_1_cdr3_tcr', 'IR_VDJ_2_cdr3_tcr', 'IR_VJ_1_cdr3_nt_tcr', 'IR_VJ_2_cdr3_nt_tcr', 'IR_VDJ_1_cdr3_nt_tcr', 'IR_VDJ_2_cdr3_nt_tcr', 'IR_VJ_1_expr_tcr', 'IR_VJ_2_expr_tcr', 'IR_VDJ_1_expr_tcr', 'IR_VDJ_2_expr_tcr', 'IR_VJ_1_expr_raw_tcr', 'IR_VJ_2_expr_raw_tcr', 'IR_VDJ_1_expr_raw_tcr', 'IR_VDJ_2_expr_raw_tcr', 'IR_VJ_1_v_gene_tcr', 'IR_VJ_2_v_gene_tcr', 'IR_VDJ_1_v_gene_tcr', 'IR_VDJ_2_v_gene_tcr', 'IR_VJ_1_d_gene_tcr', 'IR_VJ_2_d_gene_tcr', 'IR_VDJ_1_d_gene_tcr', 'IR_VDJ_2_d_gene_tcr', 'IR_VJ_1_j_gene_tcr', 'IR_VJ_2_j_gene_tcr', 'IR_VDJ_1_j_gene_tcr', 'IR_VDJ_2_j_gene_tcr', 'IR_VJ_1_c_gene_tcr', 'IR_VJ_2_c_gene_tcr', 'IR_VDJ_1_c_gene_tcr', 'IR_VDJ_2_c_gene_tcr', 'IR_VJ_1_junction_ins_tcr', 'IR_VJ_2_junction_ins_tcr', 'IR_VDJ_1_junction_ins_tcr', 'IR_VDJ_2_junction_ins_tcr', 'has_ir_tcr', 'multi_chain_tcr', 'IR_VJ_1_locus_bcr', 'IR_VJ_2_locus_bcr', 'IR_VDJ_1_locus_bcr', 'IR_VDJ_2_locus_bcr', 'IR_VJ_1_cdr3_bcr', 'IR_VJ_2_cdr3_bcr', 'IR_VDJ_1_cdr3_bcr', 'IR_VDJ_2_cdr3_bcr', 'IR_VJ_1_cdr3_nt_bcr', 'IR_VJ_2_cdr3_nt_bcr', 'IR_VDJ_1_cdr3_nt_bcr', 'IR_VDJ_2_cdr3_nt_bcr', 'IR_VJ_1_expr_bcr', 'IR_VJ_2_expr_bcr', 'IR_VDJ_1_expr_bcr', 'IR_VDJ_2_expr_bcr', 'IR_VJ_1_expr_raw_bcr', 'IR_VJ_2_expr_raw_bcr', 'IR_VDJ_1_expr_raw_bcr', 'IR_VDJ_2_expr_raw_bcr', 'IR_VJ_1_v_gene_bcr', 'IR_VJ_2_v_gene_bcr', 'IR_VDJ_1_v_gene_bcr', 'IR_VDJ_2_v_gene_bcr', 'IR_VJ_1_d_gene_bcr', 'IR_VJ_2_d_gene_bcr', 'IR_VDJ_1_d_gene_bcr', 'IR_VDJ_2_d_gene_bcr', 'IR_VJ_1_j_gene_bcr', 'IR_VJ_2_j_gene_bcr', 'IR_VDJ_1_j_gene_bcr', 'IR_VDJ_2_j_gene_bcr', 'IR_VJ_1_c_gene_bcr', 'IR_VJ_2_c_gene_bcr', 'IR_VDJ_1_c_gene_bcr', 'IR_VDJ_2_c_gene_bcr', 'IR_VJ_1_junction_ins_bcr', 'IR_VJ_2_junction_ins_bcr', 'IR_VDJ_1_junction_ins_bcr', 'IR_VDJ_2_junction_ins_bcr', 'has_ir_bcr', 'multi_chain_bcr', 'scr_th', 'doublet_scores', 'cal_th', 'scr_predicted_doublets', 'cal_dobulets', 'data_batch', 'individual_id', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'cell_typist_full_predicted_labels', 'cell_typist_full_majority_voting', 'cell_typist_initial_predicted_labels', 'cell_typist_initial_majority_voting', 'final_anno', 'timeline', 'sample_id', 'all_T', 'final_anno1', 'all_B', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'log1p_total_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'n_counts', 'cell_type', 'donor_id', 'study', 'tissue'
    var: 'gene_ids', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'GEX-0', 'GEX-2', 'ribo', 'hb', 'log1p_mean_counts', 'log1p_total_counts', 'n_cells'

combined_data.obs['age'].value_counts()

age
0    29443
0    29386
0    21621
0    18532
0    15735
0     4643
Name: count, dtype: int64

def age_to_group(age):
    if pd.isna(age):
        return 'Unknown'
    elif age < 0:
        # Prenatal age, optional handling
        return 'Prenatal'
    elif age <= 18:
        return 'Childhood'
    elif 18 < age <= 25:
        return 'Young Adult'
    elif 26 <= age <= 64:
        return 'Adult'
    elif age >= 65:
        return 'Old'
    else:
        return 'Unknown'

combined_data.obs['development_stage'] = combined_data.obs['age'].apply(age_to_group)

# Optionally convert to categorical with order:
combined_data.obs['development_stage'] = pd.Categorical(
    combined_data.obs['development_stage'],
    categories=['Prenatal', 'Childhood', 'Young Adult', 'Adult', 'Old', 'Unknown'],
    ordered=True
)

combined_data.obs['development_stage'].value_counts()

development_stage
Adult          85274
Young Adult    34086
Childhood          0
Prenatal           0
Old                0
Unknown            0
Name: count, dtype: int64

ref = pd.read_csv("../../../../Ensembl_symbol_Human_(GRCh38.p14)/mart_export.txt")  

ref

	HGNC symbol	Gene stable ID
0	MT-TF	ENSG00000210049
1	MT-RNR1	ENSG00000211459
2	MT-TV	ENSG00000210077
3	MT-RNR2	ENSG00000210082
4	MT-TL1	ENSG00000209082
...	...	...
86363	SNHG12	ENSG00000197989
86364	TAF12-DT	ENSG00000229388
86365	NaN	ENSG00000289291
86366	RNU11	ENSG00000274978
86367	NaN	ENSG00000296488

86368 rows × 2 columns

# Current gene symbols from var_names
gene_symbols = combined_data.var_names.to_list()

# Create mapping: symbol -> Ensembl ID
symbol_to_ensembl = dict(zip(ref['HGNC symbol'], ref['Gene stable ID']))

# Map gene symbols to Ensembl IDs (or None if not found)
ensembl_ids = [symbol_to_ensembl.get(gene, None) for gene in gene_symbols]

# Create a mask to keep only genes with Ensembl IDs starting with "ENSG"
keep_mask = [ensembl is not None and ensembl.startswith("ENSG") for ensembl in ensembl_ids]

# Filter var and var_names accordingly
combined_data = combined_data[:, keep_mask].copy()  # subset cells and genes

# Update var_names to Ensembl IDs for kept genes
combined_data.var['ensembl_id'] = [ensembl_ids[i] for i, keep in enumerate(keep_mask) if keep]
combined_data.var_names = combined_data.var['ensembl_id'].values

combined_data.var['gene_symbol'] = combined_data.var['gene_ids']

combined_data.X.max()

2672.0

if hasattr(combined_data.X, 'sum'):
    combined_data.obs['n_counts'] = np.ravel(combined_data.X.sum(axis=1))
else:
    combined_data.obs['n_counts'] = combined_data.X.sum(axis=1)

combined_data.obs['cell_type']

AAACCTGAGACAAAGG-1-0           NKT
AAACCTGAGCGTTCCG-1-0     CD8_naive
AAACCTGAGTGTTTGC-1-0    CD4_memory
AAACCTGCAAACTGTC-1-0           NKT
AAACCTGCACAACGTT-1-0     CD4_naive
                           ...    
TTTGGTTTCGGAATCT-1-2      Int.mono
TTTGTCAAGGCATGTG-1-2      Int.mono
TTTGTCACAAAGGAAG-1-2      CD14mono
TTTGTCATCAACACTG-1-2      CD14mono
TTTGTCATCACAACGT-1-2      CD14mono
Name: cell_type, Length: 119360, dtype: category
Categories (30, object): ['ASDC', 'B_naive', 'CD4_memory', 'CD4_naive', ..., 'plasma_IgG', 'plasma_IgM', 'plasma_dividing', 'switched memory']

combined_data.write_h5ad('./raw_data_LPS_Private_Haniffa_lab.h5ad') # save the processed data

1.3. Concatenation of public and preprocessed prived data

adata1 = sc.read('./Private_emily/raw_data_LPS_Private_Haniffa_lab.h5ad')

/nfs/users/nfs_a/am74/.cache/pypoetry/virtualenvs/cytomeister-AO4Zveyp-py3.10/lib/python3.10/site-packages/anndata/_core/anndata.py:1756: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`.
  utils.warn_names_duplicates("var")

adata2 = sc.read('./Public_Emily/Public_LPS_PP.h5ad')

# Make var_names unique to avoid conflicts during concatenation
adata2.var_names_make_unique()
adata1.var_names_make_unique()

# Concatenate the two datasets
adata = adata1.concatenate(adata2)

/tmp/ipykernel_983311/3094885911.py:1: FutureWarning: Use anndata.concat instead of AnnData.concatenate, AnnData.concatenate is deprecated and will be removed in the future. See the tutorial for concat at: https://anndata.readthedocs.io/en/latest/concatenation.html
  adata = adata1.concatenate(adata2)

adata # check the merged data

AnnData object with n_obs × n_vars = 224150 × 13826
    obs: 'patient_id', 'time_after_LPS', 'Sex', 'age', 'batch', 'IR_VJ_1_locus_tcr', 'IR_VJ_2_locus_tcr', 'IR_VDJ_1_locus_tcr', 'IR_VDJ_2_locus_tcr', 'IR_VJ_1_cdr3_tcr', 'IR_VJ_2_cdr3_tcr', 'IR_VDJ_1_cdr3_tcr', 'IR_VDJ_2_cdr3_tcr', 'IR_VJ_1_cdr3_nt_tcr', 'IR_VJ_2_cdr3_nt_tcr', 'IR_VDJ_1_cdr3_nt_tcr', 'IR_VDJ_2_cdr3_nt_tcr', 'IR_VJ_1_expr_tcr', 'IR_VJ_2_expr_tcr', 'IR_VDJ_1_expr_tcr', 'IR_VDJ_2_expr_tcr', 'IR_VJ_1_expr_raw_tcr', 'IR_VJ_2_expr_raw_tcr', 'IR_VDJ_1_expr_raw_tcr', 'IR_VDJ_2_expr_raw_tcr', 'IR_VJ_1_v_gene_tcr', 'IR_VJ_2_v_gene_tcr', 'IR_VDJ_1_v_gene_tcr', 'IR_VDJ_2_v_gene_tcr', 'IR_VJ_1_d_gene_tcr', 'IR_VJ_2_d_gene_tcr', 'IR_VDJ_1_d_gene_tcr', 'IR_VDJ_2_d_gene_tcr', 'IR_VJ_1_j_gene_tcr', 'IR_VJ_2_j_gene_tcr', 'IR_VDJ_1_j_gene_tcr', 'IR_VDJ_2_j_gene_tcr', 'IR_VJ_1_c_gene_tcr', 'IR_VJ_2_c_gene_tcr', 'IR_VDJ_1_c_gene_tcr', 'IR_VDJ_2_c_gene_tcr', 'IR_VJ_1_junction_ins_tcr', 'IR_VJ_2_junction_ins_tcr', 'IR_VDJ_1_junction_ins_tcr', 'IR_VDJ_2_junction_ins_tcr', 'has_ir_tcr', 'multi_chain_tcr', 'IR_VJ_1_locus_bcr', 'IR_VJ_2_locus_bcr', 'IR_VDJ_1_locus_bcr', 'IR_VDJ_2_locus_bcr', 'IR_VJ_1_cdr3_bcr', 'IR_VJ_2_cdr3_bcr', 'IR_VDJ_1_cdr3_bcr', 'IR_VDJ_2_cdr3_bcr', 'IR_VJ_1_cdr3_nt_bcr', 'IR_VJ_2_cdr3_nt_bcr', 'IR_VDJ_1_cdr3_nt_bcr', 'IR_VDJ_2_cdr3_nt_bcr', 'IR_VJ_1_expr_bcr', 'IR_VJ_2_expr_bcr', 'IR_VDJ_1_expr_bcr', 'IR_VDJ_2_expr_bcr', 'IR_VJ_1_expr_raw_bcr', 'IR_VJ_2_expr_raw_bcr', 'IR_VDJ_1_expr_raw_bcr', 'IR_VDJ_2_expr_raw_bcr', 'IR_VJ_1_v_gene_bcr', 'IR_VJ_2_v_gene_bcr', 'IR_VDJ_1_v_gene_bcr', 'IR_VDJ_2_v_gene_bcr', 'IR_VJ_1_d_gene_bcr', 'IR_VJ_2_d_gene_bcr', 'IR_VDJ_1_d_gene_bcr', 'IR_VDJ_2_d_gene_bcr', 'IR_VJ_1_j_gene_bcr', 'IR_VJ_2_j_gene_bcr', 'IR_VDJ_1_j_gene_bcr', 'IR_VDJ_2_j_gene_bcr', 'IR_VJ_1_c_gene_bcr', 'IR_VJ_2_c_gene_bcr', 'IR_VDJ_1_c_gene_bcr', 'IR_VDJ_2_c_gene_bcr', 'IR_VJ_1_junction_ins_bcr', 'IR_VJ_2_junction_ins_bcr', 'IR_VDJ_1_junction_ins_bcr', 'IR_VDJ_2_junction_ins_bcr', 'has_ir_bcr', 'multi_chain_bcr', 'scr_th', 'doublet_scores', 'cal_th', 'scr_predicted_doublets', 'cal_dobulets', 'data_batch', 'individual_id', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'cell_typist_full_predicted_labels', 'cell_typist_full_majority_voting', 'cell_typist_initial_predicted_labels', 'cell_typist_initial_majority_voting', 'final_anno', 'timeline', 'sample_id', 'all_T', 'final_anno1', 'all_B', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'log1p_total_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'n_counts', 'cell_type', 'donor_id', 'study', 'tissue', 'development_stage', 'full_clustering', 'initial_clustering', 'Resample', 'Collection_Day', 'Age_interval', 'Swab_result', 'Status', 'Smoker', 'Status_on_day_collection', 'Status_on_day_collection_summary', 'Days_from_onset', 'Site', 'Worst_Clinical_Status', 'Outcome', 'life_stage'
    var: 'gene_ids-0', 'mt-0', 'n_cells_by_counts-0', 'mean_counts-0', 'pct_dropout_by_counts-0', 'total_counts-0', 'GEX-0-0', 'GEX-2-0', 'ribo-0', 'hb-0', 'log1p_mean_counts-0', 'log1p_total_counts-0', 'n_cells-0', 'ensembl_id-0', 'gene_symbol-0', 'ensembl_id-1', 'gene_symbol-1', 'feature_types-1'

adata.obs['time_after_LPS'].value_counts() # check time points available

time_after_LPS
normal     148742
6h_LPS      43810
10h_LPS     20890
90m_LPS     10708
Name: count, dtype: int64

adata.obs['cell_type'].value_counts() # check cell types available

cell_type
CD14mono                     23888
CD4_naive                    18268
B_naive                      15093
CD4.Naive                    13829
NK_16hi                      13473
CD8_naive                    13249
CD4_memory                   10259
CD8_EM                        8869
gdT                           8273
CD8.Naive                     8105
CD4.CM                        7505
Int.mono                      7229
CD83_CD14_mono                6772
CD8.TE                        6501
CD4.IL22                      6379
MAIT                          6042
CD8.EM                        5734
NK_CD56dim                    4922
CD14mono_M1                   4129
CD14_mono                     3774
NKT                           3240
CD16_mono                     3007
Treg                          2457
Platelets                     2363
NK_56hi                       2190
DC2                           2024
non-swtiched memory           1583
DC3                           1463
B_switched_memory             1395
pDC                           1374
CD16mono                      1213
switched memory               1158
CD8_CM                        1004
B_immature                     966
Platelet                       862
NK_CD56bright                  856
CD4.Tfh                        800
B_non-switched_memory          647
B_exhausted                    334
non-switched memory/CD11c      301
RBC                            301
HSPC                           227
NK_prolif                      221
ILC1_3                         213
C1_CD16_mono                   198
CD4.EM                         188
Plasma_cell_IgA                173
Plasma_cell_IgG                168
DC1                            164
plasma_IgA                     160
CD4.Th1                        101
Plasmablast                     95
Plasma_cell_IgM                 84
HSC_erythroid                   74
CD8.Prolif                      70
HSC_CD38pos                     61
plasma_dividing                 50
plasma_IgG                      31
ASDC                            23
plasma_IgM                      18
Name: count, dtype: int64

# Define a mapping dictionary for renaming
cell_type_mapping = {
    # Monocytes
    "CD14mono": "CD14 monocytes",
    "CD14_mono": "CD14 monocytes",
    "CD83_CD14_mono": "CD14 monocytes",
    "CD14mono_M1": "CD14 monocytes",
    "CD16_mono": "CD16 monocytes",
    "CD16mono": "CD16 monocytes",
    "C1_CD16_mono": "CD16 monocytes",
    # CD4 T cells
    "CD4_naive": "CD4+ T cells",
    "CD4.Naive": "CD4+ T cells",
    "CD4_memory": "CD4+ T cells",
    "CD4.CM": "CD4+ T cells",
    "CD4.Tfh": "CD4+ T cells",
    "CD4.Th1": "CD4+ T cells",
    "CD4.IL22": "CD4+ T cells",
    "CD4.EM": "CD4+ T cells",
    # CD8 T cells
    "CD8_naive": "CD8+ T cells",
    "CD8.Naive": "CD8+ T cells",
    "CD8_EM": "CD8+ T cells",
    "CD8.EM": "CD8+ T cells",
    "CD8_CM": "CD8+ T cells",
    "CD8.TE": "CD8+ T cells",
    "CD8.Prolif": "CD8+ T cells",
    # NK cells
    "NK_16hi": "NK",
    "NK_56hi": "NK",
    "NK_CD56dim": "NK",
    "NK_CD56bright": "NK",
    "NK_prolif": "NK",
    # B cells
    "B_naive": "B cell",
    "B_switched_memory": "B cell",
    "non-swtiched memory" : "B cell",
    "switched memory": "B cell",
    "B_non-switched_memory": "B cell",
    "non-switched memory": "B cell",
    "non-switched memory/CD11c": "B cell",
    "B_immature": "B cell",
    "B_exhausted": "B cell",
    # Plasma cells
    "Plasma_cell_IgA": "plasma cell",
    "plasma_IgA": "plasma cell",
    "Plasma_cell_IgG": "plasma cell",
    "plasma_IgG": "plasma cell",
    "Plasma_cell_IgM": "plasma cell",
    "plasma_IgM": "plasma cell",
    "Plasmablast": "plasma cell",
    "plasma_dividing": "plasma cell",
    # Others
    "Platelets": "platelet",
    "Platelet": "platelet",
    "Treg": "regulatory T cell",
    "MAIT": "mucosal invariant T cell",
    "gdT": "gamma-delta T cell",
    "NKT": "NKT",
    "pDC": "Plasmocytoid dendritic cell",
    "DC1": "Dendritic cells",
    "DC2": "Dendritic cells",
    "DC3": "Dendritic cells",
    "ASDC": "ASDC",
    "Int.mono": "CD14 monocytes",
    "RBC": "Red_Blood_Cell",
    "HSPC": "hematopoietic stem cell",
    "HSC_erythroid": "HSC_Erythroid",
    "HSC_CD38pos": "HSC_CD38pos",
    "ILC1_3": "ILC1_3",
}

# Apply the mapping to adata.obs['cell_type']
adata.obs['cell_type_harmonized'] = adata.obs['cell_type'].map(cell_type_mapping).fillna(adata.obs['cell_type'])

# Verify the new counts
print(adata.obs['cell_type_harmonized'].value_counts())

cell_type_harmonized
CD4+ T cells                   57329
CD14 monocytes                 45792
CD8+ T cells                   43532
NK                             21662
B cell                         21477
gamma-delta T cell              8273
mucosal invariant T cell        6042
CD16 monocytes                  4418
Dendritic cells                 3651
NKT                             3240
platelet                        3225
regulatory T cell               2457
Plasmocytoid dendritic cell     1374
plasma cell                      779
Red_Blood_Cell                   301
hematopoietic stem cell          227
ILC1_3                           213
HSC_Erythroid                     74
HSC_CD38pos                       61
ASDC                              23
Name: count, dtype: int64

# Get unique timepoints
all_timepoints = adata.obs['time_after_LPS'].unique()

# Group by cell_type and get unique timepoints per cell type
cell_type_timepoints = (
    adata.obs
    .groupby('cell_type_harmonized')['time_after_LPS']
    .unique()
    .apply(list)
)

# Find cell types that do not appear in all timepoints
missing_in_any_timepoint = [
    ct for ct in cell_type_timepoints.index 
    if not set(all_timepoints).issubset(set(cell_type_timepoints[ct]))
]

print("Cell types not present in all timepoints:", missing_in_any_timepoint)

Cell types not present in all timepoints: ['ASDC', 'HSC_CD38pos', 'HSC_Erythroid', 'ILC1_3', 'Red_Blood_Cell']

adata = sc.read('./full_lps.h5ad')

cell_types_to_remove = ['ASDC', 'HSC_CD38pos', 'HSC_Erythroid', 'ILC1_3', 'Red_Blood_Cell']
keep_mask = ~adata.obs['cell_type_harmonized'].isin(cell_types_to_remove)
adata_filtered = adata[keep_mask, :].copy() 

adata_filtered.obs['cell_type_harmonized'].value_counts()

cell_type_harmonized
CD4+ T cells                   57329
CD14 monocytes                 45792
CD8+ T cells                   43532
NK                             21662
B cell                         21477
gamma-delta T cell              8273
mucosal invariant T cell        6042
CD16 monocytes                  4418
Dendritic cells                 3651
NKT                             3240
platelet                        3225
regulatory T cell               2457
Plasmocytoid dendritic cell     1374
plasma cell                      779
hematopoietic stem cell          227
Name: count, dtype: int64

len(set(adata_filtered.obs['cell_type_harmonized']))

adata_filtered.X.max()

np.float32(4066.0)

adata_filtered.write_h5ad('./full_lps.h5ad') # save the final processed data