{ "cells": [ { "cell_type": "markdown", "id": "a5f456d5", "metadata": {}, "source": [ "# 2. Tokenize transcriptomes and data pairing\n", "\n" ] }, { "cell_type": "markdown", "id": "854fe78c", "metadata": {}, "source": [ "This notebook starts with the CPU-only tokenization step. Replace placeholder paths like `path/to/...` with real locations on your system.\n" ] }, { "cell_type": "markdown", "id": "329a0780", "metadata": {}, "source": [ "## 2.1. Configure paths and parameters\n", "\n", "These parameters mirror the CLI options in `perturbgen.pp.GF_tokenisation`.\n" ] }, { "cell_type": "markdown", "id": "dd39c1c8", "metadata": {}, "source": [ "In order to download data, including the LPS data for this tutorial see https://perturbgen.cog.sanger.ac.uk/docs/data.html" ] }, { "cell_type": "code", "execution_count": null, "id": "bad16f62", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "download: s3://perturbgen/Manuscript/lps_otar.h5ad to ./lps_otar.h5adining remainingremainingremainingremainingremaining12.7 MiB/s) with 1 file(s) remaining 98.3 MiB/s) with 1 file(s) remaining \n" ] } ], "source": [ "# download LPS data from AWS S3 bucket\n", "!aws --endpoint-url https://cog.sanger.ac.uk --no-sign-request \\\n", " s3 cp s3://perturbgen/Manuscript/lps_otar.h5ad ./lps_otar.h5ad" ] }, { "cell_type": "code", "execution_count": null, "id": "8cb57beb", "metadata": {}, "outputs": [], "source": [ "H5AD_PATH = \"path/to/adata.h5ad\" #\"path/to/data.h5ad\"\n", "DATASET_NAME = \"LPS_all_tps_2k\" # choose a name for the dataset\n", "GENE_FILTERING_MODE = \"hvg\" # one of: hvg, degs, all\n", "HVG_MODE = \"before_tokenisation\" # before_tokenisation or after_tokenisation\n", "\n", "VAR_LIST = [\n", " \"cell_type_harmonized\",\n", " \"time_after_LPS\",\n", "] # list of obs to retain in adata.vars after preprocessing\n", "\n", "PAIRING_MODE = \"stratified\" # stratified, random, mapping. We select the pairing strategy here, for more info please read the paper.\n", "TIME_OBS = \"time_after_LPS\" # the obs that contains the time point information for pairing\n", "PAIRING_FILE = \"path/to/pairing.csv\" # only if PAIRING_MODE == 'mapping'\n", "MAIN_PAIRING_OBS = \"cell_type_harmonized\" # the main obs to use for pairing amongst TIME_OBS\n", "OPT_PAIRING_OBS = [] # optional additional obs\n", "\n", "NPROC = 8 # number of parallel processes to use\n", "N_HVG = 2000 # number of highly variable genes to select if HVG filtering is used\n", "TIME_POINT_ORDER = [\"normal\", \"90m_LPS\", \"6h_LPS\", \"10h_LPS\"]\n", "REFERENCE_TIME = \"normal\" # the reference time point for pairing, usually the control or untreated condition\n", "\n", "GENE_MEDIAN_PATH = \"Perturbgen/perturbgen/pp/gene_median_dict_gftokens_gc95M.pkl\" # path to gene median dictionary, provided with Perturbgen for pretrained model\n", "TOKEN_DICT_PATH = \"Perturbgen/perturbgen/pp/token_dict_gftokens_gc95M.pkl\" # path to token dictionary, provided with Perturbgen for pretrained model\n", "GENE_MAPPING_PATH = \"Perturbgen/perturbgen/pp/ensembl_mapping_dict_gc95M.pkl\" # path to gene mapping dictionary, Geneformer 95M mapping dictionary provided with Perturbgen\n" ] }, { "cell_type": "markdown", "id": "a33eaead", "metadata": {}, "source": [ "## 2.2. Build the tokenization command\n", "\n", "This prints the exact command that will be executed. Review it before running.\n" ] }, { "cell_type": "code", "execution_count": 33, "id": "6ffda7aa", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "python -m perturbgen tokenise --h5ad_path /nfs/team361/am74/Cytomeister/Evaluation_datasets/LPS/lps_otar.h5ad --dataset LPS_all_tps_2k --gene_filtering_mode hvg --hvg_mode before_tokenisation --var_list cell_type_harmonized time_after_LPS --pairing_mode stratified --time_obs time_after_LPS --main_pairing_obs cell_type_harmonized --nproc 8 --n_hvg 2000 --reference_time normal --time_point_order normal 90m_LPS 6h_LPS 10h_LPS --gene_median_path Perturbgen/perturbgen/pp/gene_median_dict_gftokens_gc95M.pkl --token_dict_path Perturbgen/perturbgen/pp/token_dict_gftokens_gc95M.pkl --gene_mapping_path Perturbgen/perturbgen/pp/ensembl_mapping_dict_gc95M.pkl\n" ] } ], "source": [ "cmd = [\n", " \"python\",\n", " \"-m\",\n", " \"perturbgen\",\n", " \"tokenise\",\n", " \"--h5ad_path\", H5AD_PATH,\n", " \"--dataset\", DATASET_NAME,\n", " \"--gene_filtering_mode\", GENE_FILTERING_MODE,\n", " \"--hvg_mode\", HVG_MODE,\n", " \"--var_list\", *VAR_LIST,\n", " \"--pairing_mode\", PAIRING_MODE,\n", " \"--time_obs\", TIME_OBS,\n", " \"--main_pairing_obs\", MAIN_PAIRING_OBS,\n", " \"--nproc\", str(NPROC),\n", " \"--n_hvg\", str(N_HVG),\n", " \"--reference_time\", REFERENCE_TIME,\n", " \"--time_point_order\", *TIME_POINT_ORDER,\n", " \"--gene_median_path\", GENE_MEDIAN_PATH,\n", " \"--token_dict_path\", TOKEN_DICT_PATH,\n", " \"--gene_mapping_path\", GENE_MAPPING_PATH,\n", "]\n", "\n", "if PAIRING_MODE == \"mapping\":\n", " cmd += [\"--pairing_file\", PAIRING_FILE]\n", "if OPT_PAIRING_OBS:\n", " cmd += [\"--opt_pairing_obs\", *OPT_PAIRING_OBS]\n", "\n", "print(\" \".join(cmd))\n" ] }, { "cell_type": "markdown", "id": "09e33ee3", "metadata": {}, "source": [ "## 2.3. Run tokenization (CPU-only)\n", "\n", "This step can take time depending on dataset size.\n" ] }, { "cell_type": "code", "execution_count": null, "id": "c9011d65", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "loading, please wait...\n", "Current working directory: /lustre/scratch126/cellgen/lotfollahi/dv8/for_otar\n", "Start preprocessing adata...\n", "Number of genes dropped: 0\n", "Finished preprocessing adata.\n", "Start tokenisation of adata...\n", "Tokenizing /lustre/scratch126/cellgen/lotfollahi/dv8/for_otar/T_perturb/tokenized_data/LPS_all_tps_2k/h5ad_pairing_2000_hvg/LPS_all_tps_2k.h5ad\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/lustre/scratch126/cellgen/lotfollahi/dv8/for_otar/Perturbgen/perturbgen/pp/tokenizer.py:495: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", " for i in adata.var[\"ensembl_id_collapsed\"][coding_miRNA_loc]\n", "/lustre/scratch126/cellgen/lotfollahi/dv8/for_otar/Perturbgen/perturbgen/pp/tokenizer.py:498: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", " coding_miRNA_ids = adata.var[\"ensembl_id_collapsed\"][coding_miRNA_loc]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "/lustre/scratch126/cellgen/lotfollahi/dv8/for_otar/T_perturb/tokenized_data/LPS_all_tps_2k/h5ad_pairing_2000_hvg/LPS_all_tps_2k.h5ad has no column attribute 'filter_pass'; tokenizing all cells.\n", "Creating dataset.\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Map (num_proc=4): 100%|██████████| 223478/223478 [00:29<00:00, 7559.59 examples/s] \n", "Saving the dataset (1/1 shards): 100%|██████████| 223478/223478 [00:00<00:00, 458199.76 examples/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Finished tokenisation.\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/lustre/scratch126/cellgen/lotfollahi/dv8/for_otar/Perturbgen/perturbgen/src/utils.py:1643: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.\n", " adata_obs_.groupby(grouping_obs)[time_obs].transform('nunique') == total_tps\n", "/lustre/scratch126/cellgen/lotfollahi/dv8/for_otar/Perturbgen/perturbgen/src/utils.py:1646: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.\n", " grouped = adata_grouped.groupby(grouping_obs)\n", "Map (num_proc=4): 100%|██████████| 223478/223478 [01:33<00:00, 2399.51 examples/s]\n", " 0%| | 0/4 [00:00