# Location of your MIDRC credentials file. Download it from
# https://data.midrc.org/identity by clicking the "Create API key" button and
# saving the credentials.json locally.
cred = "/Users/christopher/Downloads/midrc-credentials.json"

# The base URL of the data commons being queried. This shouldn't change for MIDRC.
api = "https://data.midrc.org"

## The packages below may need to be installed to satisfy the imports used in the subsequent cells.

#!pip install --upgrade pandas
#!pip install --upgrade --ignore-installed PyYAML
#!pip install --upgrade pip
#!pip install --upgrade gen3

## Import Python Packages and scripts

import os
import subprocess

import gen3
from gen3.auth import Gen3Auth
from gen3.query import Gen3Query

# Authentication class instance for the commons at `api`, using the credentials file at `cred`
auth = Gen3Auth(endpoint=api, refresh_file=cred)

# Query class instance, constructed with the authentication instance above
query = Gen3Query(auth)

#### "case" query parameters
## In this example, we filter our patient cohort to male patients between the ages of 50 and 89 who received a Pfizer or Moderna vaccine, have an RT-PCR COVID-19 test result (positive or negative), and have a COVID-19 or post-COVID-19 condition recorded.

## Demographic filters applied to properties of the case node itself
sex = "Male"
min_age, max_age = 50, 89

#### "Nested" filters: attributes from other nodes that are nested under the case node
#### (the "child nodes" of case in the data model: data.midrc.org/dd)

## medications node (vaccine data)
## Other manufacturers that can be added: "Janssen", "AstraZeneca", "Sinopharm", "Novavax"
medication_manufacturer = [
    "Pfizer",
    "Moderna",
]

## measurements node (COVID-19 test data)
## Another test method that can be added: "Rapid antigen test"
test_method = [
    "RT-PCR",
]
test_result_text = [
    "Positive",
    "Negative",
]

## conditions node (co-morbidities and long COVID)
## Another condition that can be added: "Pneumonia, organism unspecified"
condition_name = [
    "COVID-19",
    "Post COVID-19 condition, unspecified",
]

## procedures node
procedure_name = [
    "Breathing Support",
]

## Here is an example getting all the cases that match the demographic and nested filters defined above
## the "fields" option defines what fields we want the query to return. If set to "None", returns all available fields.

## Filters on properties of the case node itself
case_property_filters = [
    {"=": {"sex": sex}},
    {">=": {"age_at_index": min_age}},
    {"<=": {"age_at_index": max_age}},
]

## Filters on properties of nodes nested under case, written as (path, property, allowed values).
## The procedures filter ("procedures", "procedure_name", procedure_name) is deliberately
## left out: adding too many filters returns no data.
nested_filter_specs = [
    ("medications", "medication_manufacturer", medication_manufacturer),
    ("measurements", "test_method", test_method),
    ("measurements", "test_result_text", test_result_text),
    ("conditions", "condition_name", condition_name),
]
nested_filters = [
    {"nested": {"path": path, "IN": {prop: values}}}
    for path, prop, values in nested_filter_specs
]

## Run the case query. All filters are combined with "AND" and the results are sorted by submitter_id.
cases = query.raw_data_download(
    data_type="case",
    fields=None,  # None returns all available fields; e.g. ["project_id", "submitter_id"] would restrict them
    filter_object={"AND": case_property_filters + nested_filters},
    sort_fields=[{"submitter_id": "asc"}],
)

## Make a list of the case (patient) IDs returned by the query.
## case_ids is always defined (possibly empty) so that later cells which filter on it
## do not fail with a NameError when the query returns nothing; records without a
## "submitter_id" are skipped instead of raising a KeyError.
case_ids = [case["submitter_id"] for case in cases if "submitter_id" in case]

if case_ids:
    print("Query returned {} case IDs.".format(len(case_ids)))
    print("Data is a list with rows like this:\n\t {}".format(cases[0:1]))
else:
    print("Your query returned no data! Please, check that query parameters are valid.")

## Look at one record returned by the query (evaluates to None when the query returned nothing,
## instead of raising an IndexError)
# Note: the "object_id" field is a list of all file identifiers associated with the case
cases[0] if cases else None

## Values allowed for the "source_node" property of the data files we want
source_nodes = [
    "cr_series_file",
    "dx_series_file",
    "annotation_file",
    "dicom_annotation_file",
]

## Values allowed for the "modality" property. This is somewhat redundant with the
## source_nodes filter above, but is included for demonstration purposes.
modality = [
    "SEG",
    "CR",
    "DX",
]

## Search for specific files associated with our cohort by using "case_ids" as a filter.
## Each (property, allowed values) pair below becomes one "IN" filter.
data_file_filters = [
    {"IN": {prop: values}}
    for prop, values in (
        ("case_ids", case_ids),
        ("source_node", source_nodes),
        ("modality", modality),
    )
]

# * Note: "fields" is set to None in this query, which by default returns all the properties available
data_files = query.raw_data_download(
    data_type="data_file",
    fields=None,
    filter_object={"AND": data_file_filters},
    sort_fields=[{"submitter_id": "asc"}],
)

## Summarize the data_file query response
if len(data_files) == 0:
    print("Your query returned no data! Please, check that query parameters are valid.")
else:
    ## make a list of the file object_ids returned by our query (records without one are skipped)
    object_ids = [record['object_id'] for record in data_files if 'object_id' in record]
    file_count = len(data_files)
    id_count = len(object_ids)
    print("Query returned {} data files with {} object_ids.".format(file_count, id_count))
    first_rows = data_files[0:1]
    print("Data is a list with rows like this:\n\t {}".format(first_rows))

## View the detailed data for the first file returned (evaluates to None when the query
## returned nothing, instead of raising an IndexError)
data_files[0] if data_files else None

## Build a list of the object_ids of all files returned by the data_file query
## (records without an "object_id" are skipped)
object_ids = [data_file["object_id"] for data_file in data_files if "object_id" in data_file]

## Pick the first object_id (index 0) for the single-file download example.
## object_id is set to None when no object_ids were returned, so an empty query
## result produces a clear message rather than an IndexError.
if object_ids:
    object_id = object_ids[0]
    print("The first object_id of {}: '{}'".format(len(object_ids), object_id))
else:
    object_id = None
    print("No object_ids were returned by the query; there is nothing to download.")

## Make a new directory for downloaded files (no error if it already exists)
os.makedirs("downloads", exist_ok=True)

## Run the "gen3 drs-pull object" command to download a file.
## The command is given as an argument list and run without a shell, so a credentials
## path containing spaces or shell metacharacters is passed through unchanged.
if object_id:
    cmd = [
        "gen3",
        "--auth", cred,
        "--endpoint", "data.midrc.org",
        "drs-pull", "object", object_id,
        "--output-dir", "downloads",
    ]
    subprocess.run(cmd)
else:
    print("No object_id is set; skipping the download.")

!find downloads -name "*dcm"

## Simple loop to download all files and keep track of successes and failures.
## Uses the `cred` credentials path defined at the top of the file, so there is a
## single place to change it.
success, failure, other = [], [], []
count, total = 0, len(object_ids)
for count, object_id in enumerate(object_ids, start=1):
    # Argument list (no shell): paths with spaces or shell metacharacters are passed unchanged
    cmd = [
        "gen3",
        "--auth", cred,
        "--endpoint", "data.midrc.org",
        "drs-pull", "object", object_id,
        "--output-dir", "downloads",
    ]
    # text=True decodes the captured output to str, so it prints readably and can be searched directly
    stout = subprocess.run(cmd, capture_output=True, text=True)
    print("Progress ({}/{}): {}".format(count, total, stout.stdout))
    # Classify each download by the wording of the command's standard output
    if "failed" in stout.stdout:
        failure.append(object_id)
    elif "successfully" in stout.stdout:
        success.append(object_id)
    else:
        other.append(object_id)

!find downloads -name "*.dcm"

!find downloads -name "*.dcm" | wc -l

How to Access Files for Specific Case IDs

1) Set up Python environment

Set local variables

Install / Import Python Packages and Scripts

Initiate instances of the Gen3 SDK Classes using credentials file for authentication

2) Build a cohort of cases by running queries against MIDRC APIs

Set 'case' query parameters

3) Send another query to get data file details for our cohort / case ID

Set 'data_file' query parameters

4) Access data files using their object_id / data GUID (globally unique identifiers)

Parse the data_file query response to build a list of all `object_id`s returned for our cohort.

Use the Gen3 SDK command `gen3 drs-pull object` to download an individual file

Use a simple loop to download all the files

The End

How to Access Files for Specific Case IDs

1) Set up Python environment

Set local variables

Install / Import Python Packages and Scripts

Initiate instances of the Gen3 SDK Classes using credentials file for authentication

2) Build a cohort of cases by running queries against MIDRC APIs

Set 'case' query parameters

3) Send another query to get data file details for our cohort / case ID

Set 'data_file' query parameters

4) Access data files using their object_id / data GUID (globally unique identifiers)

Parse the data_file query response to build a list of all object_ids returned for our cohort.

Use the Gen3 SDK command gen3 drs-pull object to download an individual file

Use a simple loop to download all the files

The End

Parse the data_file query response to build a list of all `object_id`s returned for our cohort.

Use the Gen3 SDK command `gen3 drs-pull object` to download an individual file