# The packages below may need to be installed to satisfy the imports used in the subsequent cells.

#!pip install --upgrade pandas
#!pip install --upgrade --ignore-installed PyYAML
#!pip install --upgrade pip
#!pip install --upgrade gen3 --user
#!pip install cdiserrors
#!pip install --upgrade pydicom


import os
import sys
import webbrowser
from io import StringIO

import gen3
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pydicom
from gen3.auth import Gen3Auth
from gen3.index import Gen3Index
from gen3.query import Gen3Query
from gen3.submission import Gen3Submission


# Import some custom Python scripts from personal GitHub repo.
# Change these directory paths to reflect your local working directory.

home_dir = "/Users/christopher" 
demo_dir = "{}/Documents/Notes/MIDRC/tutorials".format(home_dir)

os.chdir(demo_dir)

os.system("wget https://raw.githubusercontent.com/cgmeyer/gen3sdk-python/master/expansion/expansion.py -O {}/expansion.py".format(demo_dir))
%run expansion.py


# Create the Gen3 SDK client objects, authenticating with a credentials file.
# Point "cred" at the location of your own credentials file.

api = "https://data.midrc.org"
cred = f"{home_dir}/Downloads/midrc-credentials.json"

# authentication class
auth = Gen3Auth(api, refresh_file=cred)
# submission class
sub = Gen3Submission(api, auth)
# query class
query = Gen3Query(auth)
# class with some custom scripts
exp = Gen3Expansion(api, auth, sub)

exp.get_project_ids()


def age_group(agelist):
    """Sort subjects into age groups, one label per input age.

    Args:
        agelist: Iterable of numeric ages (e.g. a list or a pandas Series).
            Missing values (``None`` / NaN) are allowed.

    Returns:
        A list, in input order, with one entry per age: "-20 yr" for ages
        up to and including 20, "21-30 yr" through "81-90 yr" for each
        following ten-year band (upper bound inclusive), and "90+ yr" for
        anything above 90. A missing age yields ``None``. An empty input
        yields an empty list.
    """
    from bisect import bisect_left

    groups = ["-20 yr", "21-30 yr", "31-40 yr", "41-50 yr", "51-60 yr",
              "61-70 yr", "71-80 yr", "81-90 yr", "90+ yr"]
    # Inclusive upper bound of every group except the open-ended last one.
    upper_bounds = [20, 30, 40, 50, 60, 70, 80, 90]

    grouplist = []
    for age in agelist:
        if pd.isna(age):
            # NaN fails every "<=" comparison, so without this check a
            # missing age would fall through and be counted as "90+ yr".
            grouplist.append(None)
        else:
            # bisect_left returns the index of the first bound >= age,
            # which is also the index of the matching label (8 when the
            # age is above every bound).
            grouplist.append(groups[bisect_left(upper_bounds, age)])

    return grouplist

def percent_representation(df, demographic_type, demographics):
    """Express each demographic value as a percent of the COVID-19 negative
    and of the COVID-19 positive subjects.

    Args:
        df: Data frame with a 'covid19_positive' column holding 'Yes' / 'No'
            and a column named ``demographic_type``.
        demographic_type: Name of the demographic column (e.g. 'race').
        demographics: List of values of that column to report on.

    Returns:
        A data frame with the columns ``demographic_type``, 'Percent' and
        'COVID-19 Status'. It has one row per entry of ``demographics`` for
        the 'Negative' subjects followed by one row per entry for the
        'Positive' subjects; the index of each half starts at 0 again.
        'Percent' is the share of that status group whose
        ``demographic_type`` equals the value, rounded to two decimals.
        It is 0.0 for every value when the status group has no subjects.
    """

    def _percents(subset):
        # Percent of `subset` rows matching each requested demographic value.
        total = len(subset)
        if total == 0:
            # No subjects with this status: report 0% rather than divide by zero.
            return [0.0] * len(demographics)
        column = subset[demographic_type]
        # Scale to percent before rounding so the result carries no
        # floating-point noise (rounding first and then multiplying by 100
        # can give values like 7.010000000000001).
        return [round(int((column == demo).sum()) / total * 100, 2)
                for demo in demographics]

    frames = []
    for answer, status in (('No', 'Negative'), ('Yes', 'Positive')):
        subset = df[df['covid19_positive'] == answer]
        frames.append(pd.DataFrame({
            demographic_type: demographics,
            'Percent': _percents(subset),
            'COVID-19 Status': status,
        }))

    return pd.concat(frames)


# Export the "case" node of project Open-R1 through the Gen3 connection "sub"
# as TSV text and read it into a data frame.
cases = sub.export_node(program='Open', project='R1', node_type='case', fileformat='tsv')
df = pd.read_csv(StringIO(cases), sep='\t', header=0)
df['zip'] = df['zip'].astype(str)
df['age_group'] = age_group(df['age_at_index'])

# Replace the long race labels with shorter ones.
short_race_labels = {
    'Native Hawaiian or other Pacific Islander': 'Pacific Islander',
    'American Indian or Alaskan Native': 'American Indian',
    'Black or African American': 'Black or A.A.',
}
for long_label, short_label in short_race_labels.items():
    df.loc[df.race == long_label, 'race'] = short_label

# Keep only the columns used below.
keep_columns = ['covid19_positive', 'age_group', 'sex', 'ethnicity', 'race']
df = df[keep_columns]
df.head()


# Count the subjects by COVID-19 status.
n_positive = len(df[df['covid19_positive'] == 'Yes'])
n_negative = len(df[df['covid19_positive'] == 'No'])
covid_breakdown = {'Number of COVID-19 positive subjects': n_positive,
                   'Number of COVID-19 negative subjects': n_negative}
print(covid_breakdown)

# Positivity is the share of positives among all subjects with a Yes/No
# status, i.e. positive / (positive + negative). Dividing by the negative
# count alone gives a ratio that can exceed 100%.
n_with_status = n_positive + n_negative
if n_with_status:
    positivity = round(n_positive / n_with_status * 100, 1)
else:
    # No subjects with a Yes/No status: the percentage is undefined.
    positivity = float('nan')
print("Positivity percentage = {}%".format(positivity))


# Grouped bar chart: percent of COVID-19 negative and positive subjects per race.
races = [
    'Black or A.A.',
    'White',
    'Asian',
    'Pacific Islander',
    'American Indian',
    'Other',
    'Not Reported',
]
plot_df = percent_representation(df, 'race', races)
X = np.arange(len(races))

fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])

# One bar series per status, shifted left/right of each tick so that the
# two bars of a category sit side by side.
for status, shift, bar_color in (('Negative', -0.2, 'b'), ('Positive', 0.2, 'r')):
    percents = plot_df[plot_df['COVID-19 Status'] == status]['Percent']
    ax.bar(X + shift, percents, color=bar_color, width=0.4, label=status)

ax.set_xticks(X)
ax.set_xticklabels(races, rotation=25)
ax.set(ylabel='Percent', xlabel='Race', title='Subject Representation By Race')

ax.legend()
plt.show()


# Grouped bar chart: percent of COVID-19 negative and positive subjects per ethnicity.
ethnicities = ['Not Hispanic or Latino', 'Hispanic or Latino']

plot_df = percent_representation(df, 'ethnicity', ethnicities)
X = np.arange(len(ethnicities))

fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])

# One bar series per status, shifted left/right of each tick so that the
# two bars of a category sit side by side.
for status, shift, bar_color in (('Negative', -0.2, 'b'), ('Positive', 0.2, 'r')):
    percents = plot_df[plot_df['COVID-19 Status'] == status]['Percent']
    ax.bar(X + shift, percents, color=bar_color, width=0.4, label=status)

ax.set_xticks(X)
ax.set_xticklabels(ethnicities, rotation=25)
ax.set(ylabel='Percent', xlabel='Ethnicity', title='Subject Representation By Ethnicity')

ax.legend()
plt.show()


# Grouped bar chart: percent of COVID-19 negative and positive subjects per sex.
sexes = ['Male', 'Female']

plot_df = percent_representation(df, 'sex', sexes)
X = np.arange(len(sexes))

fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])

# One bar series per status, shifted left/right of each tick so that the
# two bars of a category sit side by side.
for status, shift, bar_color in (('Negative', -0.2, 'b'), ('Positive', 0.2, 'r')):
    percents = plot_df[plot_df['COVID-19 Status'] == status]['Percent']
    ax.bar(X + shift, percents, color=bar_color, width=0.4, label=status)

ax.set_xticks(X)
ax.set_xticklabels(sexes, rotation=25)
ax.set(ylabel='Percent', xlabel='Sex', title='Subject Representation By Sex')

ax.legend()
plt.show()


# Grouped bar chart: percent of COVID-19 negative and positive subjects per age group.
ages = ["-20 yr", "21-30 yr", "31-40 yr", "41-50 yr", "51-60 yr", "61-70 yr", "71-80 yr", "81-90 yr", "90+ yr"]

plot_df = percent_representation(df, 'age_group', ages)
# Take the tick positions from the label list (as the other plots do) rather
# than a hard-coded 9, so the bars and labels stay aligned if the groups change.
X = np.arange(len(ages))

fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])

# The two series are shifted left/right of each tick so that the bars of a
# group sit side by side.
ax.bar(X - 0.2,
       plot_df[plot_df['COVID-19 Status'] == 'Negative']['Percent'], color='b', width=0.4, label='Negative')
ax.bar(X + 0.2,
       plot_df[plot_df['COVID-19 Status'] == 'Positive']['Percent'], color='r', width=0.4, label='Positive')

ax.set_xticks(X)
ax.set_xticklabels(ages, rotation=25)

ax.set_ylabel('Percent')
ax.set_xlabel('Age Group')

ax.set_title('Subject Representation By Age Group')

ax.legend()
plt.show()

# MIDRC Open-R1 Clinical Data

# Created By: J Montgomery Maxwell

# Subjects' COVID-19 Status

# Subject Distribution