DEVCOMSuitepython Pipeline: Gene Importance, Causal Signal Flow and SURD

Data

Data source: Google Drive folder

All paths below are given relative to your project root. Adjust them to match your own directory structure if needed.

1. Gene Importance Core

import scanpy as sc
from DEVCOMSuitepython.CCI_importance_core import compute_gene_importance_all

cell_types = ["Endothelial", "Erythrocyte"]

res = compute_gene_importance_all(
    cell_types=cell_types,
    input_dir="./DEVCOM Suite test/tests/testthat/gene importance/",
    output_dir="./DEVCOM Suite test/tests/testthat/gene importance/",
    expr_suffix="_gene_expression.csv",
    threshold_pcc=0.8,
    threshold_p_value=0.05,
    lambda_factor=0.1,
    n_jobs=None,   # None = auto
    verbose=True
)

Example console output

开始处理所有细胞类型: ['Endothelial', 'Erythrocyte']
使用进程数: 2
[Endothelial] 读取表达矩阵: /home/wgh/DEVCOM Suite test/tests/testthat/gene importance/Endothelial_gene_expression.csv
[Erythrocyte] 读取表达矩阵: /home/wgh/DEVCOM Suite test/tests/testthat/gene importance/Erythrocyte_gene_expression.csv

[Endothelial] 过滤所有样本表达量均为 0 的基因...
[Endothelial] 开始计算 PCC 和 p 值...
[Erythrocyte] 过滤所有样本表达量均为 0 的基因...
[Erythrocyte] 开始计算 PCC 和 p 值...
[Erythrocyte] 开始进行多重检验校正 (FDR-BH)...
[Erythrocyte] 开始构建基因网络...
[Erythrocyte] 开始计算基因重要性得分...
[Endothelial] 开始进行多重检验校正 (FDR-BH)...
[Endothelial] 开始构建基因网络...
[Erythrocyte] 基因重要性得分已保存为: /home/wgh/DEVCOM Suite test/tests/testthat/gene importance/Erythrocyte_gene_importance_scores.csv
[Endothelial] 开始计算基因重要性得分...
[Endothelial] 基因重要性得分已保存为: /home/wgh/DEVCOM Suite test/tests/testthat/gene importance/Endothelial_gene_importance_scores.csv

2. Causal Signal-Flow Inference (FlowSig)

import scanpy as sc
from DEVCOMSuitepython.flowsig_prior_from import run_flowsig_pipeline

adata = sc.read("/home/wgh/DEVCOM Suite test/tests/testthat/flow/burkhardt21_merged.h5ad")

run_flowsig_pipeline(
    adata,
    prior_csv       = "./DEVCOM Suite test/tests/testthat/flow/human_L-R-TF_develop_filtered_LR_TF_top5000.csv",
    condition_key   = "Condition",
    control_name    = "Ctrl",
    construct_gems  = True,
    n_gems          = 10,
    counts_layer    = "counts",
    inflow_construction = "v1",
    edge_threshold  = 0.7,
    save_h5ad       = "./DEVCOM Suite test/tests/testthat/flow/burkhardt21_merged_flow_learned.h5ad"
)

Example console output

Removing 37 genes not expressing in Ctrl.
100%|██████████| 30/30 [00:59<00:00,  1.97s/it]
... storing 'Type' as categorical
... storing 'Downstream_TF' as categorical
... storing 'Type' as categorical
... storing 'Downstream_TF' as categorical
Starting 50 bootstraps on 1 cores …
100%|██████████| 50/50 [23:58<00:00, 28.78s/it]
Finished in 1,439.0 s
AnnData object with n_obs × n_vars = 5305 × 18027
    obs: 'sample_labels', 'Donor', 'Condition', 'library_size', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'doublet_score', 'predicted_doublet', 'n_counts', 'log_counts', 'n_genes', 'leiden', 'Type'
    var: 'gene_symbols', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_variable_nbatches', 'highly_variable_intersection'
    uns: 'Condition', 'Condition_colors', 'Donor_colors', 'NMF_10', 'NMF_CV', 'Type', 'base_networks', 'base_networks_leiden', 'causal_networks', 'causal_networks_leiden', 'cellchat_output', 'cpdb_Type', 'flowsig_network', 'flowsig_network_cpdb', 'flowsig_network_cpdb_orig', 'flowsig_network_orig', 'hvg', 'learned_networks', 'leiden', 'leiden_colors', 'log1p', 'neighbors', 'pca', 'pyliger', 'pyliger_10', 'pyliger_11', 'pyliger_12', 'pyliger_15', 'pyliger_20', 'pyliger_3', 'pyliger_5', 'pyliger_8', 'pyliger_9', 'pyliger_info', 'pyliger_vars', 'sample_labels_colors', 'scrublet', 'umap'
    obsm: 'X_SC', 'X_celltype_ligand', 'X_celltype_ligand_leiden', 'X_flow', 'X_flow_cpdb', 'X_flow_cpdb_orig', 'X_flow_orig', 'X_gem', 'X_pca', 'X_umap'
    varm: 'PCs'
    layers: 'counts', 'normalized'
    obsp: 'connectivities', 'distances'

3. Interpretable Causal Signal Flow with SURD

3.1 Running SURD Segments

import scanpy as sc

# from surd_segments_api import load_surd_from_dir
# utils file
# load_surd_from_dir("/home/wgh/SOUD/SURD-main/utils")

from DEVCOMSuitepython.utils.surd import surd
from DEVCOMSuitepython.surd_segments import run_surd_segments
import DEVCOMSuitepython.surd_segments as ss

ss.surd = surd

adata = sc.read("/home/wgh/DEVCOM Suite test/tests/testthat/SURD/burkhardt21_merged_flow_learned.h5ad")

res = run_surd_segments(
    adata,
    ctrl_name  = "Ctrl",   # condition control
    treat_name = "IFNg",   # condition treat
    out_dir    = "./",
    fast_mode  = True
)

Example console output

[SURD] condition_key = Condition
[SURD] condition value_counts = {'IFNg': 3129, 'Ctrl': 2176}
[SURD] celltype_key = Type, n_types = 3
[SURD] picked adjacency from uns path: flowsig_network.network.adjacency_validated, shape=(236, 236)
[SURD] matched_groups = 3, total_treat_cells = 3129
[SURD] inflows=180, gems=10, outflows=46
[SURD] graph_edges=734, inflow->GEM edges=247, GEM->outflow edges=161

4. KNN Sweep for SURD (K Selection)

import os
import scanpy as sc

from DEVCOMSuitepython.utils.surd import surd
from DEVCOMSuitepython.surd_segments import run_surd_k_sweep
import DEVCOMSuitepython.surd_segments as ss

ss.surd = surd

adata = sc.read("./DEVCOM Suite test/tests/testthat/SURD/burkhardt21_merged_flow_learned.h5ad")

# 2) Path settings
DATA_DIR = "./DEVCOM Suite test/tests/testthat/SURD/"

H5AD_OUT = os.path.join(
    DATA_DIR,
    "burkhardt21_merged_SURD.h5ad"
)

# Output directory of the main SURD pipeline
# (should already contain gem2out_unique_synergy_redundancy.csv)
SEGMENTS_DIR = os.path.join(
    DATA_DIR,
    "surd_segments_results"
)

# IMPORTANT: only join the file name here, do NOT prepend an absolute path again
parent_csv_path = os.path.join(
    SEGMENTS_DIR,
    "gem2out_unique_synergy_redundancy.csv"
)

# 3) Load h5ad
adata = sc.read(H5AD_OUT)

# 4) Run K sweep (post-hoc)
k_df, best_row = run_surd_k_sweep(
    adata,
    ctrl_name="Ctrl",
    treat_name="IFNg",
    parent_csv_path=parent_csv_path,
    out_dir=SEGMENTS_DIR,          # K sweep results will also be written here
    k_range=range(1, 21),
    top_parents_per_target=8,
    n_perm_small=50,
    nbins_target=16,
    min_bins_per_dim=6,
    max_total_bins=20_000_000,
    discretize="quantile",
    min_samples_block=100,
    make_plots=True,
)

print("K sweep finished. Results written to:",
      os.path.join(SEGMENTS_DIR, "k_sweep_summary.csv"))

if best_row is not None:
    print("Best K:", int(best_row["K"]), "score =", float(best_row["score"]))
else:
    print("No valid blocks found, k_df is empty. "
          "Please check the parent CSV or condition column.")

Example console output

[load_surd_from_dir] 已从 /home/wgh/SOUD/SURD-main/utils 导入 surd()
K 扫描完成：结果写入： /home/wgh/DEVCOM Suite test/tests/testthat/SURD/surd_segments_results/k_sweep_summary.csv
最佳 K: 9 score = 0.9969896487413793