# Start by importing the cptac package
import cptac

# Load the endometrial (UCEC) dataset; every later cell goes through `en`.
en = cptac.Ucec()

# Show which sources are available for each data type.
en.list_data_sources()

# Pull the transcriptomics table from the "bcm" source and preview it.
bcm_data = en.get_transcriptomics("bcm")
bcm_data.head()

# Omics-to-omics join with multi_join. Each key is "<source> <datatype>";
# an empty string as the value selects the entire dataframe.
prot_and_tran = en.multi_join(
    {
        "umich proteomics": "",
        "bcm transcriptomics": "",
    }
)
prot_and_tran.head()

cptac warning: Your version of cptac (1.5.1) is out-of-date. Latest is 1.5.0. Please run 'pip install --upgrade cptac' to update it. (C:\Users\sabme\anaconda3\lib\threading.py, line 910)

# Same omics-to-omics join, restricted to one named column per table.
prot_and_tran_selected = en.multi_join(
    {
        "umich proteomics": "ARF5",
        "bcm transcriptomics": "A1BG",
    }
)
prot_and_tran_selected.head()

# Metadata-to-omics join: the full clinical table next to the full
# transcriptomics table.
clin_and_tran = en.multi_join(
    {
        "mssm clinical": "",
        "bcm transcriptomics": "",
    }
)
clin_and_tran.head()

# The same pair of tables, limited to the listed clinical columns and genes.
clin_and_tran = en.multi_join(
    {
        "mssm clinical": ["age", "Overall survival, days"],
        "bcm transcriptomics": ["ZYX", "ZZEF1"],
    }
)
clin_and_tran.head()

# Note that an empty string or an empty list as the value joins the entire
# dataframe, so this call repeats the first join above.
clin_and_tran = en.multi_join({"mssm clinical": "", "bcm transcriptomics": ""})

clin_and_tran.head()

# Join many data types at once: two single-column omics selections plus the
# whole clinical and somatic-mutation tables (an empty list selects everything).
joining_dictionary = {
    "umich proteomics": "ARF5",
    "bcm transcriptomics": "A1BG",
    "mssm clinical": [],
    "washu somatic_mutation": [],
}
en.multi_join(joining_dictionary).head()

# Select two columns from the clinical metadata table through multi_join.
sample_type_and_discovery = en.multi_join(
    {
        "mssm clinical": ["type_of_analyzed_samples", "discovery_study"],
    }
)
sample_type_and_discovery.head()

# Fetch the harmonized somatic-mutation table.
somatic_mutations = en.get_somatic_mutation("harmonized")

# Omics-to-mutations join: umich proteomics for ARF5 and M6PR alongside the
# harmonized SHANK2 mutation calls.
selected_prot_and_som_mut = en.join_omics_to_mutations(
    omics_name="proteomics",
    omics_source="umich",
    omics_genes=["ARF5", "M6PR"],
    mutations_source="harmonized",
    mutations_genes="SHANK2",
)
selected_prot_and_som_mut.head(10)

cptac warning: In joining the somatic_mutation table, no mutations were found for the following samples, so they were filled with Wildtype_Tumor or Wildtype_Normal: 141 samples for the SHANK2 gene (C:\Users\sabme\anaconda3\lib\site-packages\cptac\cancers\cancer.py, line 325)

# Two variants of the same proteomics-to-SHANK2 join; they differ only in
# `mutations_filter`.

# Empty filter list.
# NOTE(review): presumably this falls back to cptac's built-in mutation
# priority; confirm against the cptac documentation.
SHANK2_default_filter = en.multi_join(
    {
        "umich proteomics": ["ARF5", "M6PR"],
        "harmonized somatic_mutation": "SHANK2",
    },
    mutations_filter=[],
)

# Filter on a single mutation type.
SHANK2_simple_filter = en.multi_join(
    {
        "umich proteomics": ["ARF5", "M6PR"],
        "harmonized somatic_mutation": "SHANK2",
    },
    mutations_filter=["Missense_Mutation"],
)

# Complex filter on PTEN: one specific location (p.R130Q) plus one mutation
# type. The mutation gene must be PTEN here: p.R130Q is a PTEN location, and
# requesting SHANK2 with this filter only makes cptac warn that the filter
# value "does not exist in the mutations data for the SHANK2 gene".
PTEN_complex_filter = en.multi_join(
    {
        "umich proteomics": ["ARF5", "M6PR"],
        "harmonized somatic_mutation": "PTEN",
    },
    mutations_filter=["p.R130Q", "Nonsense_Mutation"],
)

cptac warning: Unknown mutation type Intron. Assigned lowest priority in filtering. (C:\Users\sabme\anaconda3\lib\site-packages\cptac\cancers\cancer.py, line 525)
cptac warning: In joining the somatic_mutation table, no mutations were found for the following samples, so they were filled with Wildtype_Tumor or Wildtype_Normal: 141 samples for the SHANK2 gene (C:\Users\sabme\AppData\Local\Temp\ipykernel_2264\3972322211.py, line 1)
cptac warning: Unknown mutation type Intron. Assigned lowest priority in filtering. (C:\Users\sabme\anaconda3\lib\site-packages\cptac\cancers\cancer.py, line 525)
cptac warning: In joining the somatic_mutation table, no mutations were found for the following samples, so they were filled with Wildtype_Tumor or Wildtype_Normal: 141 samples for the SHANK2 gene (C:\Users\sabme\AppData\Local\Temp\ipykernel_2264\3972322211.py, line 5)
cptac warning: Filter value p.R130Q does not exist in the mutations data for the SHANK2 gene, though it exists for other genes. (C:\Users\sabme\anaconda3\lib\site-packages\cptac\cancers\cancer.py, line 525)
cptac warning: Unknown mutation type Intron. Assigned lowest priority in filtering. (C:\Users\sabme\anaconda3\lib\site-packages\cptac\cancers\cancer.py, line 525)
cptac warning: In joining the somatic_mutation table, no mutations were found for the following samples, so they were filled with Wildtype_Tumor or Wildtype_Normal: 141 samples for the SHANK2 gene (C:\Users\sabme\AppData\Local\Temp\ipykernel_2264\3972322211.py, line 9)

# Look at the clinical metadata table from the "mssm" source.
en.get_clinical("mssm")

# Metadata-to-mutations join: three clinical columns alongside the harmonized
# SHANK2 mutation calls, filtered on missense mutations.
en.join_metadata_to_mutations(
    metadata_name="clinical",
    metadata_source="mssm",
    metadata_cols=["age", "sex", "race"],
    mutations_source="harmonized",
    mutations_genes="SHANK2",
    mutations_filter=["Missense_Mutation"],
)

cptac warning: Unknown mutation type Intron. Assigned lowest priority in filtering. (C:\Users\sabme\anaconda3\lib\site-packages\cptac\cancers\cancer.py, line 525)
cptac warning: In joining the somatic_mutation table, no mutations were found for the following samples, so they were filled with Wildtype_Tumor or Wildtype_Normal: 92 samples for the SHANK2 gene (C:\Users\sabme\anaconda3\lib\site-packages\cptac\cancers\cancer.py, line 437)

# The same kind of join through multi_join, for several mutation genes at
# once and without a mutations filter.
en.multi_join(
    {
        "mssm clinical": ["age", "sex", "race"],
        "harmonized somatic_mutation": ["SHANK2", "PTEN", "TP53"],
    }
)

cptac warning: In joining the somatic_mutation table, no mutations were found for the following samples, so they were filled with Wildtype_Tumor or Wildtype_Normal: 92 samples for the SHANK2 gene, 28 samples for the PTEN gene, 80 samples for the TP53 gene (C:\Users\sabme\AppData\Local\Temp\ipykernel_2264\3189298179.py, line 1)

# Clinical columns joined to three mutation genes, filtered on missense
# mutations.
# NOTE(review): despite the variable name, no survival column is requested and
# PTEN and TP53 are included alongside SHANK2; consider renaming.
survival_and_SHANK2 = en.multi_join(
    {
        "mssm clinical": ["age", "sex", "race"],
        "harmonized somatic_mutation": ["SHANK2", "PTEN", "TP53"],
    },
    mutations_filter=["Missense_Mutation"],
)

survival_and_SHANK2

cptac warning: Unknown mutation type Intron. Assigned lowest priority in filtering. (C:\Users\sabme\anaconda3\lib\site-packages\cptac\cancers\cancer.py, line 525)
cptac warning: In joining the somatic_mutation table, no mutations were found for the following samples, so they were filled with Wildtype_Tumor or Wildtype_Normal: 92 samples for the SHANK2 gene, 28 samples for the PTEN gene, 80 samples for the TP53 gene (C:\Users\sabme\AppData\Local\Temp\ipykernel_2264\3101478147.py, line 1)

# Export the joined dataframe as a tab-separated file.
# NOTE(review): the file name mentions histologic type and PTEN only, while the
# frame holds age/sex/race plus SHANK2, PTEN and TP53 mutation columns;
# consider a name that matches the contents.
survival_and_SHANK2.to_csv("histologic_type_and_PTEN_mutation.tsv", sep="\t")

	Data type	Available sources
0	CNV	[bcm, washu]
1	circular_RNA	[bcm]
2	miRNA	[bcm, washu]
3	proteomics	[bcm, umich]
4	transcriptomics	[bcm, broad, washu]
5	ancestry_prediction	[harmonized]
6	somatic_mutation	[harmonized, washu]
7	clinical	[mssm]
8	follow-up	[mssm]
9	medical_history	[mssm]
10	acetylproteomics	[umich]
11	phosphoproteomics	[umich]
12	cibersort	[washu]
13	hla_typing	[washu]
14	tumor_purity	[washu]
15	xcell	[washu]

Name	A1BG	A1BG-AS1	A1CF	A2M	A2M-AS1	A2ML1	A2ML1-AS1	A2ML1-AS2	A2MP1	A3GALT2	...	ZXDB	ZXDC	ZYG11A	ZYG11AP1	ZYG11B	ZYX	ZYXP1	ZZEF1	hsa-mir-1253	hsa-mir-423
Database_ID	ENSG00000121410.12	ENSG00000268895.6	ENSG00000148584.15	ENSG00000175899.15	ENSG00000245105.4	ENSG00000166535.20	ENSG00000256661.1	ENSG00000256904.1	ENSG00000256069.7	ENSG00000184389.9	...	ENSG00000198455.4	ENSG00000070476.15	ENSG00000203995.10	ENSG00000232242.2	ENSG00000162378.13	ENSG00000159840.16	ENSG00000274572.1	ENSG00000074755.15	ENSG00000272920.1	ENSG00000266919.3
Patient_ID
C3L-00006	2.54	5.11	3.60	13.75	6.45	7.08	1.80	0.00	2.60	1.16	...	10.17	10.61	5.54	0.0	11.85	10.60	0.0	11.87	0.0	0.0
C3L-00008	4.40	4.63	5.49	13.89	6.61	6.97	0.00	2.74	3.25	0.00	...	9.79	10.48	7.79	0.0	12.28	11.28	0.0	11.93	0.0	0.0
C3L-00032	4.83	7.26	3.73	14.48	6.91	9.56	0.98	0.00	3.26	0.00	...	9.43	9.97	6.48	0.0	11.72	10.37	0.0	11.70	0.0	0.0
C3L-00084	4.73	6.01	5.37	15.17	7.93	3.86	0.00	0.00	3.73	1.15	...	9.23	10.37	7.47	0.0	11.86	10.13	0.0	11.19	0.0	0.0
C3L-00090	4.14	6.24	5.69	13.87	6.79	4.32	0.00	0.00	3.23	0.00	...	9.69	9.64	7.60	0.0	11.98	10.31	0.0	11.45	0.0	0.0

Name	ARF5_umich_proteomics	M6PR_umich_proteomics	ESRRA_umich_proteomics	FKBP4_umich_proteomics	NDUFAF7_umich_proteomics	FUCA2_umich_proteomics	DBNDD1_umich_proteomics	SEMA3F_umich_proteomics	CFTR_umich_proteomics	CYP51A1_umich_proteomics	...	ZXDB_bcm_transcriptomics	ZXDC_bcm_transcriptomics	ZYG11A_bcm_transcriptomics	ZYG11AP1_bcm_transcriptomics	ZYG11B_bcm_transcriptomics	ZYX_bcm_transcriptomics	ZYXP1_bcm_transcriptomics	ZZEF1_bcm_transcriptomics	hsa-mir-1253_bcm_transcriptomics	hsa-mir-423_bcm_transcriptomics
Database_ID	ENSP00000000233.5	ENSP00000000412.3	ENSP00000000442.6	ENSP00000001008.4	ENSP00000002125.4	ENSP00000002165.5	ENSP00000002501.6	ENSP00000002829.3	ENSP00000003084.6	ENSP00000003100.8	...	ENSG00000198455.4	ENSG00000070476.15	ENSG00000203995.10	ENSG00000232242.2	ENSG00000162378.13	ENSG00000159840.16	ENSG00000274572.1	ENSG00000074755.15	ENSG00000272920.1	ENSG00000266919.3
Patient_ID
C3L-00006	-0.056513	0.016557	0.002569	0.389819	0.603610	-0.332543	-0.790426	NaN	0.822732	0.039134	...	10.17	10.61	5.54	0.0	11.85	10.60	0.0	11.87	0.0	0.0
C3L-00008	0.549959	-0.206129	0.905784	-0.303631	0.018767	0.503513	0.950955	0.080142	NaN	-0.063213	...	9.79	10.48	7.79	0.0	12.28	11.28	0.0	11.93	0.0	0.0
C3L-00032	0.088681	-0.154447	-0.190515	0.170753	0.196356	0.544194	-0.179078	NaN	NaN	0.377405	...	9.43	9.97	6.48	0.0	11.72	10.37	0.0	11.70	0.0	0.0
C3L-00084	-0.846555	0.027740	NaN	0.178700	0.264054	-0.183548	0.077215	-0.247164	0.152277	-0.279549	...	9.23	10.37	7.47	0.0	11.86	10.13	0.0	11.19	0.0	0.0
C3L-00090	0.539019	0.956619	-0.039516	0.323656	0.064605	0.173433	-0.524325	-0.038590	-0.311486	0.309905	...	9.69	9.64	7.60	0.0	11.98	10.31	0.0	11.45	0.0	0.0

Name	ARF5_umich_proteomics	A1BG_bcm_transcriptomics
Database_ID	ENSP00000000233.5	ENSG00000121410.12
Patient_ID
C3L-00006	-0.056513	2.54
C3L-00008	0.549959	4.40
C3L-00032	0.088681	4.83
C3L-00084	-0.846555	4.73
C3L-00090	0.539019	4.14

Name	tumor_code	discovery_study	type_of_analyzed_samples_mssm_clinical	confirmatory_study	type_of_analyzed_samples_mssm_clinical	age	sex	race	ethnicity	ethnicity_race_ancestry_identified	...	ZXDB_bcm_transcriptomics	ZXDC_bcm_transcriptomics	ZYG11A_bcm_transcriptomics	ZYG11AP1_bcm_transcriptomics	ZYG11B_bcm_transcriptomics	ZYX_bcm_transcriptomics	ZYXP1_bcm_transcriptomics	ZZEF1_bcm_transcriptomics	hsa-mir-1253_bcm_transcriptomics	hsa-mir-423_bcm_transcriptomics
Database_ID											...	ENSG00000198455.4	ENSG00000070476.15	ENSG00000203995.10	ENSG00000232242.2	ENSG00000162378.13	ENSG00000159840.16	ENSG00000274572.1	ENSG00000074755.15	ENSG00000272920.1	ENSG00000266919.3
Patient_ID
C3L-00006	UCEC	Yes	Tumor_and_Normal	NaN	NaN	64	Female	White	Not Hispanic or Latino	White	...	10.17	10.61	5.54	0.0	11.85	10.60	0.0	11.87	0.0	0.0
C3L-00008	UCEC	Yes	Tumor	NaN	NaN	58	Female	White	Not Hispanic or Latino	White	...	9.79	10.48	7.79	0.0	12.28	11.28	0.0	11.93	0.0	0.0
C3L-00032	UCEC	Yes	Tumor	NaN	NaN	50	Female	White	Not Hispanic or Latino	White	...	9.43	9.97	6.48	0.0	11.72	10.37	0.0	11.70	0.0	0.0
C3L-00084	UCEC	Yes	Tumor	NaN	NaN	74	Female	White	Not Hispanic or Latino	White	...	9.23	10.37	7.47	0.0	11.86	10.13	0.0	11.19	0.0	0.0
C3L-00090	UCEC	Yes	Tumor	NaN	NaN	75	Female	White	Not Hispanic or Latino	White	...	9.69	9.64	7.60	0.0	11.98	10.31	0.0	11.45	0.0	0.0

Tutorial 3: Joining dataframes with `cptac`

General format

Join dictionary

Join omics to omics

Join metadata to omics

Join metadata to metadata

Join many datatypes together

Join omics to mutations

Filtering multiple mutations

Join metadata to mutations

Exporting dataframes

Name	ARF5_umich_proteomics	M6PR_umich_proteomics	SHANK2_Mutation	SHANK2_Location	SHANK2_Mutation_Status	Sample_Status
Patient_ID
C3L-00006	-0.056513	0.016557	[Missense_Mutation]	[p.S1692R]	Single_mutation	Tumor
C3L-00008	0.549959	-0.206129	[Wildtype_Tumor]	[No_mutation]	Wildtype_Tumor	Tumor
C3L-00032	0.088681	-0.154447	[Wildtype_Tumor]	[No_mutation]	Wildtype_Tumor	Tumor
C3L-00084	-0.846555	0.027740	[Wildtype_Tumor]	[No_mutation]	Wildtype_Tumor	Tumor
C3L-00090	0.539019	0.956619	[Wildtype_Tumor]	[No_mutation]	Wildtype_Tumor	Tumor
C3L-00098	-0.017370	0.125574	[Wildtype_Tumor]	[No_mutation]	Wildtype_Tumor	Tumor
C3L-00136	0.230347	0.575436	[Wildtype_Tumor]	[No_mutation]	Wildtype_Tumor	Tumor
C3L-00137	0.191915	0.113577	[Wildtype_Tumor]	[No_mutation]	Wildtype_Tumor	Tumor
C3L-00139	-0.410142	0.381355	[Wildtype_Tumor]	[No_mutation]	Wildtype_Tumor	Tumor
C3L-00143	-0.170514	1.008577	[Wildtype_Tumor]	[No_mutation]	Wildtype_Tumor	Tumor

Name	age	sex	race	SHANK2_Mutation	SHANK2_Location	SHANK2_Mutation_Status	PTEN_Mutation	PTEN_Location	PTEN_Mutation_Status	TP53_Mutation	TP53_Location	TP53_Mutation_Status	Sample_Status
Patient_ID
C3L-00006	64	Female	White	[Missense_Mutation]	[p.S1692R]	Single_mutation	[Missense_Mutation, Nonsense_Mutation]	[p.R130Q, p.R233*]	Multiple_mutation	[Missense_Mutation]	[p.R248W]	Single_mutation	Tumor
C3L-00008	58	Female	White	[Wildtype_Tumor]	[No_mutation]	Wildtype_Tumor	[Missense_Mutation]	[p.G127R]	Single_mutation	[Wildtype_Tumor]	[No_mutation]	Wildtype_Tumor	Tumor
C3L-00032	50	Female	White	[Wildtype_Tumor]	[No_mutation]	Wildtype_Tumor	[Nonsense_Mutation]	[p.W111*]	Single_mutation	[Wildtype_Tumor]	[No_mutation]	Wildtype_Tumor	Tumor
C3L-00084	74	Female	White	[Wildtype_Tumor]	[No_mutation]	Wildtype_Tumor	[Wildtype_Tumor]	[No_mutation]	Wildtype_Tumor	[Wildtype_Tumor]	[No_mutation]	Wildtype_Tumor	Tumor
C3L-00090	75	Female	White	[Wildtype_Tumor]	[No_mutation]	Wildtype_Tumor	[Missense_Mutation]	[p.R130G]	Single_mutation	[Wildtype_Tumor]	[No_mutation]	Wildtype_Tumor	Tumor
...	...	...	...	...	...	...	...	...	...	...	...	...	...
C3N-01520	69	Female	Unknown	[Missense_Mutation]	[p.P1586S]	Single_mutation	[Frame_Shift_Del, Frame_Shift_Ins]	[p.N323fs, p.D268fs]	Multiple_mutation	[Wildtype_Tumor]	[No_mutation]	Wildtype_Tumor	Tumor
C3N-01521	75	Female	Unknown	[Wildtype_Tumor]	[No_mutation]	Wildtype_Tumor	[Wildtype_Tumor]	[No_mutation]	Wildtype_Tumor	[Missense_Mutation]	[p.H193L]	Single_mutation	Tumor
C3N-01537	74	Female	Unknown	[Wildtype_Tumor]	[No_mutation]	Wildtype_Tumor	[Wildtype_Tumor]	[No_mutation]	Wildtype_Tumor	[Wildtype_Tumor]	[No_mutation]	Wildtype_Tumor	Tumor
C3N-01802	85	Female	Black or African American	[Wildtype_Tumor]	[No_mutation]	Wildtype_Tumor	[Wildtype_Tumor]	[No_mutation]	Wildtype_Tumor	[Missense_Mutation]	[p.P27S]	Single_mutation	Tumor
C3N-01825	70	Female	Unknown	[Wildtype_Tumor]	[No_mutation]	Wildtype_Tumor	[Wildtype_Tumor]	[No_mutation]	Wildtype_Tumor	[Missense_Mutation]	[p.R175H]	Single_mutation	Tumor

Tutorial 3: Joining dataframes with `cptac`

General format

Join dictionary

Join omics to omics

Join metadata to omics

Join metadata to metadata

Join many datatypes together

Join omics to mutations

Filtering multiple mutations

Join metadata to mutations

Exporting dataframes

Tutorial 3: Joining dataframes with `cptac`