import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import gseapy as gp
from gseapy.plot import barplot, dotplot

import cptac
en = cptac.Ucec()

tumorProt = en.join_metadata_to_omics(
    metadata_name="clinical",
    metadata_source='mssm',
    metadata_cols='type_of_analyzed_samples',
    omics_name="proteomics",
    omics_source='umich'
)

# Drop the second instance of the duplicated column
tumorProt = tumorProt.loc[:,~tumorProt.columns.duplicated()]

# Retrieve boolean array of true values for tumor and normal samples
tumor_bool = tumorProt['type_of_analyzed_samples_mssm_clinical'] == "Tumor"
normal_bool = tumorProt['type_of_analyzed_samples_mssm_clinical'] != "Tumor"

# Use boolean array to select for appropriate patients
tumor = tumorProt[tumor_bool]
normal = tumorProt[normal_bool]

# Create array variables to hold the significant genes for each partition
tumor_genes = []
normal_genes = []

# Grab the genes of interest, ignoring the MSI column in the dataframe
genes = tumor.columns[1:]

# Correct alpha level for multiple testing by dividing the standard .05 by the number of genes to be analyzed
threshold = .05 / len(genes)

# Perform Welch's t-test (different variances) on each gene between the two groups
for gene in genes:
    tumor_gene_abundance = tumor[gene]
    normal_gene_abundance = normal[gene]
    
    if len(tumor_gene_abundance.shape) > 1 or len(normal_gene_abundance.shape) > 1:
        # take the mean across the columns
        tumor_gene_abundance = tumor_gene_abundance.mean(axis=1)
        normal_gene_abundance = normal_gene_abundance.mean(axis=1)
    
    pvalue = stats.ttest_ind(tumor_gene_abundance, normal_gene_abundance, equal_var=False, nan_policy='omit').pvalue
    # If the P-value is significant, determine which partition is more highly expressed
    if pvalue < threshold:
        if tumor_gene_abundance.mean() > normal_gene_abundance.mean():
            tumor_genes.append(gene.split("_")[0])
        elif normal_gene_abundance.mean() > tumor_gene_abundance.mean():
            normal_genes.append(gene.split("_")[0])

print("Proteomics Tumor Genes:", len(tumor_genes))
print("Proteomics Normal Genes:", len(normal_genes))

cptac warning: Your version of cptac (1.5.1) is out-of-date. Latest is 1.5.0. Please run 'pip install --upgrade cptac' to update it. (C:\Users\sabme\anaconda3\lib\threading.py, line 910)
C:\Users\sabme\anaconda3\lib\site-packages\scipy\stats\_stats_py.py:1103: RuntimeWarning: divide by zero encountered in true_divide
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
C:\Users\sabme\anaconda3\lib\site-packages\scipy\stats\_stats_py.py:1103: RuntimeWarning: invalid value encountered in double_scalars
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero

Proteomics Tumor Genes: 662
Proteomics Normal Genes: 921

tumor_enr = gp.enrichr(gene_list=tumor_genes, gene_sets='KEGG_2016', outdir='test/enrichr_kegg_tumor')
normal_enr = gp.enrichr(gene_list=normal_genes, gene_sets='KEGG_2016', outdir='test/enrichr_kegg_normal')

# View the data as a table
print(tumor_enr.res2d.head())
print(normal_enr.res2d.head())

    Gene_set                                               Term Overlap  \
0  KEGG_2016                   Proteasome Homo sapiens hsa03050   17/44   
1  KEGG_2016                     Lysosome Homo sapiens hsa04142  16/123   
2  KEGG_2016  Amino sugar and nucleotide sugar metabolism Ho...   10/48   
3  KEGG_2016                RNA transport Homo sapiens hsa03013  18/172   
4  KEGG_2016  Epstein-Barr virus infection Homo sapiens hsa0...  18/202   

        P-value  Adjusted P-value  Old P-value  Old Adjusted P-value  \
0  1.673514e-14      3.531115e-12            0                     0   
1  3.060119e-06      2.176135e-04            0                     0   
2  3.094031e-06      2.176135e-04            0                     0   
3  1.686874e-05      8.898263e-04            0                     0   
4  1.399243e-04      5.904807e-03            0                     0   

   Odds Ratio  Combined Score  \
0   18.850818      597.971810   
1    4.451492       56.520842   
2    7.789797       98.821639   
3    3.481810       38.265261   
4    2.909567       25.820683   

                                               Genes  
0  PSMD12;PSMD11;PSMD13;PSMD6;PSMC5;PSMD7;PSMD4;P...  
1  CTSA;MANBA;SLC11A2;CLTA;CTSS;SUMF1;AP1G2;AP1G1...  
2  TSTA3;HK3;CYB5R4;GALE;GMDS;UXS1;GFPT1;PMM2;GNP...  
3  EIF2B4;EIF5B;EIF2B3;EIF2B2;POP1;GEMIN2;POP4;RP...  
4  RB1;MAP2K3;HSPA8;PSMD12;PSMD11;PSMD13;PIK3CB;M...  
    Gene_set                                               Term Overlap  \
0  KEGG_2016  Complement and coagulation cascades Homo sapie...   29/79   
1  KEGG_2016               Focal adhesion Homo sapiens hsa04510  44/202   
2  KEGG_2016           Pathways in cancer Homo sapiens hsa05200  48/397   
3  KEGG_2016  Vascular smooth muscle contraction Homo sapien...  22/120   
4  KEGG_2016  Regulation of actin cytoskeleton Homo sapiens ...  28/214   

        P-value  Adjusted P-value  Old P-value  Old Adjusted P-value  \
0  4.096192e-19      1.011759e-16            0                     0   
1  3.485833e-18      4.305003e-16            0                     0   
2  1.023334e-09      8.425452e-08            0                     0   
3  2.620863e-08      1.618383e-06            0                     0   
4  6.380614e-07      3.034681e-05            0                     0   

   Odds Ratio  Combined Score  \
0   12.373117      523.866113   
1    6.008141      241.514180   
2    2.950797       61.082089   
3    4.739756       82.742763   
4    3.184891       45.431931   

                                               Genes  
0  CFD;CPB2;SERPINA1;CFH;C1S;PLAT;PLG;C8B;C8A;KNG...  
1  TNXB;LAMA4;PTEN;ILK;CRKL;PPP1CB;RAP1B;VTN;AKT3...  
2  CDKN1B;CTBP1;LAMA4;PTEN;RASGRP2;FGF2;FOXO1;GNA...  
3  GUCY1A2;PPP1R14A;RAMP2;CALCRL;ARHGEF12;PPP1R12...  
4  FGF2;CRKL;PPP1CB;RRAS;CFL2;GNA12;PIP4K2B;PIP5K...

barplot(tumor_enr.res2d, title="Proteomics Tumor KEGG_2016")
plt.show()

barplot(normal_enr.res2d, title="Normal KEGG_2016")
plt.show()

Use Case 5: Gene Set Enrichment Analysis¶

Step 1: Importing packages and setting up your notebook.¶

Step 2: Joining dataframes¶

Step 3: Data Segregation¶

Step 4: Performing Welch's t-test¶

Step 5: Gene set enrichment analysis (GSEA)¶

Step 6: Visualizing Enrichment Results¶

Discussion¶