import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import cptac
# Instantiate the cptac Ucec dataset object; the clinical table is pulled from it.
en = cptac.Ucec()

# Fetch the clinical table ('mssm' is presumably the data-source identifier
# -- confirm against the cptac documentation).
clinical_data = en.get_clinical('mssm')

# List every column name available in the clinical table (a ton of data columns).
print(list(clinical_data.columns))

['tumor_code', 'discovery_study', 'type_of_analyzed_samples', 'confirmatory_study', 'type_of_analyzed_samples', 'age', 'sex', 'race', 'ethnicity', 'ethnicity_race_ancestry_identified', 'Inferred ancestry', 'collection_in_us', 'participant_country', 'maternal_grandmother_country', 'maternal_grandfather_country', 'paternal_grandmother_country', 'paternal_grandfather_country', 'deaf_or_difficulty_hearing', 'blind_or_difficulty_seeing', 'difficulty_concentrating_remembering_or_making_decisions', 'difficulty_walking_or_climbing_stairs', 'difficulty_dressing_or_bathing', 'difficulty_doing_errands', 'consent_form_signed', 'case_stopped', 'tumor_site', 'tumor_site_other', 'tumor_laterality', 'tumor_focality', 'tumor_size_cm', 'histologic_type', 'histologic_grade', 'tumor_necrosis', 'margin_status', 'ajcc_tnm_cancer_staging_edition_used', 'pathologic_staging_primary_tumor_pt', 'pathologic_staging_regional_lymph_nodes_pn', 'number_of_lymph_nodes_examined', 'number_of_lymph_nodes_positive_for_tumor_by_he_staining', 'clinical_staging_distant_metastasis_cm', 'pathologic_staging_distant_metastasis_pm', 'specify_distant_metastasis_documented_sites', 'residual_tumor', 'tumor_stage_pathological', 'paraneoplastic_syndrome_present', 'ancillary_studies_immunohistochemistry_performed', 'ancillary_studies_immunohistochemistry_type_and_result', 'ancillary_studies_other_testing_performed', 'ancillary_studies_other_testing_type_and_result', 'performance_status_assessment_ecog_performance_status_score', 'performance_status_assessment_karnofsky_performance_status_score', 'number_of_lymph_nodes_positive_for_tumor_by_ihc_staining', 'perineural_invasion', 'height_at_time_of_surgery_cm', 'weight_at_time_of_surgery_kg', 'bmi', 'history_of_cancer', 'alcohol_consumption', 'tobacco_smoking_history', 'age_at_which_the_participant_started_smoking', 'age_at_which_the_participant_stopped_smoking', 'on_the_days_participant_smoked_how_many_cigarettes_did_he_she_usually_smoke', 
'number_of_pack_years_smoked', 'was_the_participant_exposed_to_secondhand_smoke', 'exposure_to_secondhand_smoke_in_household_during_participants_childhood', 'exposure_to_secondhand_smoke_in_participants_current_household', 'number_of_years_participant_has_consumed_more_than_2_drinks_per_day_for_men_and_more_than_1_drink_per_day_for_women', 'cancer_type', 'history_source', 'history_of_any_treatment', 'medical_record_documentation_of_this_history_of_cancer_and_treatment', 'medical_condition', 'history_of_treatment', 'history_source', 'medication_name_vitamins_supplements', 'history_source', 'blood_collection_minimum_required_blood_collected', 'blood_collection_number_of_blood_tubes_collected', 'tumor_tissue_collection_tumor_type', 'tumor_tissue_collection_number_of_tumor_segments_collected', 'tumor_tissue_collection_clamps_used', 'tumor_tissue_collection_frozen_with_oct', 'normal_adjacent_tissue_collection_number_of_normal_segments_collected', 'follow_up_period', 'is_this_patient_lost_to_follow-up', 'vital_status_at_date_of_last_contact', 'number_of_days_from_date_of_initial_pathologic_diagnosis_to_date_of_last_contact', 'number_of_days_from_date_of_initial_pathologic_diagnosis_to_date_of_death', 'cause_of_death', 'number_of_days_from_date_of_collection_to_date_of_last_contact', 'number_of_days_from_date_of_collection_to_date_of_death', 'adjuvant_post-operative_radiation_therapy', 'adjuvant_post-operative_pharmaceutical_therapy', 'adjuvant_post-operative_immunological_therapy', 'tumor_status_at_date_of_last_contact_or_death', 'measure_of_success_of_outcome_at_the_completion_of_initial_first_course_treatment', 'measure_of_success_of_outcome_at_date_of_last_contact_or_death', 'ecog_performance_status_score_at_date_of_last_contact_or_death', 'karnofsky_performance_status_score_at_date_of_last_contact_or_death', 'performance_status_scale_timing_at_date_of_last_contact_or_death', 'measure_of_success_of_outcome_at_first_NTE', 'ecog_performance_status_score_at_first_NTE', 
'karnofsky_performance_status_score_at_first_NTE', 'performance_status_scale_timing_at_first_NTE', 'new_tumor_after_initial_treatment', 'number_of_days_from_date_of_initial_pathologic_diagnosis_to_date_of_new_tumor_event_after_initial_treatment', 'type_of_new_tumor', 'site_of_new_tumor', 'other_site_of_new_tumor', 'diagnostic_evidence_of_recurrence_or_relapse', 'additional_surgery_for_new_tumor_loco-regional', 'additional_surgery_for_new_tumor_metastasis', 'residual_tumor_after_surgery_for_new_tumor', 'additional_treatment_radiation_therapy_for_new_tumor', 'additional_treatment_pharmaceutical_therapy_for_new_tumor', 'additional_treatment_immuno_for_new_tumor', 'number_of_days_from_date_of_initial_pathologic_diagnosis_to_date_of_additional_surgery_for_new_tumor_event_loco-regional', 'number_of_days_from_date_of_initial_pathologic_diagnosis_to_date_of_additional_surgery_for_new_tumor_event_metastasis', 'Recurrence-free survival, days', 'Recurrence-free survival from collection, days', 'Recurrence status (1, yes; 0, no)', 'Overall survival, days', 'Overall survival from collection, days', 'Survival status (1, dead; 0, alive)']

# Inspect the pathological tumor stage column: distinct labels, then how many
# rows carry each label.
stage_column = clinical_data['tumor_stage_pathological']
# NOTE(review): this result is not assigned or printed; when run as a notebook
# cell only the last expression below is echoed.
stage_column.unique()
stage_column.value_counts()

tumor_stage_pathological
Stage I      76
Stage III    15
Stage II      9
Stage IV      3
Name: count, dtype: int64

# Report the range and mean of the 'tumor_size_cm' column.
tumor_sizes = clinical_data['tumor_size_cm']
print(f"The minimum tumor size is {tumor_sizes.min()}")
print(f"The maximum tumor size is {tumor_sizes.max()}")
print(f"The average tumor size is {tumor_sizes.mean()}")

The minimum tumor size is 0.0
The maximum tumor size is 13.5
The average tumor size is 3.8572815533980584

# Kernel density estimate of BMI, drawn as one curve per pathological stage.
sns.displot(
    data=clinical_data,
    x="bmi",
    hue="tumor_stage_pathological",
    kind="kde",
    # Normalize each stage's density on its own so that stages with few
    # samples are not flattened by the larger groups.
    common_norm=False,
    # Scale the default bandwidth down a little for slightly less smoothing.
    bw_adjust=0.7,
)
plt.show()

# Box plot of BMI per pathological stage, with the individual observations
# overlaid as a jittered strip plot on the same axes.
#
# Pin the category order explicitly: without `order`, seaborn infers the levels
# from the data, so the x-axis is not guaranteed to read Stage I -> Stage IV.
# The same order is passed to both layers so boxes and points stay aligned.
stage_order = ["Stage I", "Stage II", "Stage III", "Stage IV"]
stage_bmi_args = dict(
    data=clinical_data,
    x="tumor_stage_pathological",
    y="bmi",
    order=stage_order,
)

# showfliers=False hides the box plot's outlier markers; the strip plot below
# already draws every observation, so they would be plotted twice otherwise.
figoStrip = sns.boxplot(**stage_bmi_args, showfliers=False)
figoStrip = sns.stripplot(**stage_bmi_args, jitter=True, color=".3")
figoStrip.set(xlabel="FIGO stage")
plt.show()

# Keep only the two columns needed to compare BMI against cancer history.
cancer_bmi = clinical_data[['history_of_cancer', 'bmi']]

# Kernel density estimate of BMI per 'history_of_cancer' value, leaving out
# the rows whose history is recorded as "Unknown".
history_is_known = cancer_bmi['history_of_cancer'] != "Unknown"
sns.displot(
    data=cancer_bmi[history_is_known],
    x="bmi",
    hue="history_of_cancer",
    kind="kde",
    common_norm=False,  # normalize each group's density independently
)
plt.show()

# Box plot of BMI per 'history_of_cancer' value with the individual
# observations overlaid as a jittered strip plot. Unlike the density plot,
# "Unknown" rows are not filtered out here.
# NOTE(review): the variable is called `diabetes_plot` although it shows
# history_of_cancer -- the name looks like a leftover; confirm before renaming.
history_bmi_args = dict(data=cancer_bmi, x="history_of_cancer", y="bmi")
diabetes_plot = sns.boxplot(**history_bmi_args, showfliers=False)
diabetes_plot = sns.stripplot(**history_bmi_args, jitter=True, color=".3")
plt.show()

cptac warning: Your version of cptac (1.5.1) is out-of-date. Latest is 1.5.0. Please run 'pip install --upgrade cptac' to update it. (C:\Users\sabme\anaconda3\lib\threading.py, line 910)

Use Case 2: Comparing Clinical Attributes

Step 1: Importing packages and setting up your notebook

Step 2: Getting data

Step 3: Investigating the data

Step 4: Plot the data

Step 5: Plot other data