290 likes | 415 Vues
This guide details how to create custom syndrome definitions using R and RStudio by leveraging data from Tarrant County's healthcare database. It provides step-by-step SQL queries to analyze the co-occurrence of conditions like asthma and influenza-like illness. The guide includes instructions on retrieving data, processing it, creating visualizations with ggplot, and generating summary reports. Useful for data scientists looking to explore public health data and model syndromes relevant to time and location.
E N D
Creating Syndrome Definitions Using RStudio Tim Hopper Data Scientist RTI International
Code Is Available Online https://gist.github.com/tdhopper/d5939aaf74886143224e/raw/3ae883a25ef078a5edd2fcced0f0268b34be3d6b/Custom+Syndromes
Setup # Connect to TarrantCounty_FP database # Credentials USERNAME <- 'username' PASSWORD <- 'password' HOSTNAME <- 'data3.biosen.se' DBNAME <- 'TarrantCounty_FP' TABLE <- 'TC_Meaningful_Use_Base' # Create database connection con <- dbConnect(dbDriver('MySQL'), user=USERNAME, password=PASSWORD, host=HOSTNAME, dbname=DBNAME)
Example: Co-morbid Syndrome • We want to see the co-occurrence of influenza (influenza-like illness) and asthma. • Data source: Texas region 2/3 • Location: Tarrant County • Time: February 1–October 31, 2013
Query for Asthma SELECT Facility_City, Facility_State, Diagnosis_Code, Diagnosis_Text, Chief_Complaint, Age, Gender, Visit_Date_Time, Row_Number FROM TC_Meaningful_Use_Base WHERE Visit_Date_Time BETWEEN '2013-02-01 00:00:00' AND '2013-10-31 23:59:59' AND (Diagnosis_Code LIKE '%493%')
Query for Influenza-Like Illness SELECT Facility_City, Facility_State, Diagnosis_Code, Diagnosis_Text, Chief_Complaint, Age, Gender, Visit_Date_Time, Row_Number FROM TC_Meaningful_Use_Base WHERE Visit_Date_Time BETWEEN '2013-02-01 00:00:00' AND '2013-10-31 23:59:59' AND (Diagnosis_Code LIKE '%487%' OR Diagnosis_Code LIKE '%488%' OR Diagnosis_Code LIKE '%V04.8%' OR Diagnosis_Code LIKE '%V0481%' OR Diagnosis_Code LIKE '%V06.6%' OR Diagnosis_Code LIKE '%V066%')
Run Query and Process Data # Run Query df.asthma <- dbGetQuery(con, query.asthma) df.ili <- dbGetQuery(con, query.ili) # Add column naming each as a syndrome df.asthma$Syndrome <- 'ASTHMA' df.ili$Syndrome <- 'ILI' # Combine these two data sets into one data.frame df <- rbind(df.asthma, df.ili) # Format dates and add date column (without time) df$Visit_Date_Time <- ymd_hms(df$Visit_Date_Time) df$Visit_Date <- as.Date(df$Visit_Date_Time)
Create Summary Data Set events.per.day.split <- ddply(df, .(Visit_Date, Syndrome), summarize, Number_of_Visits=length(Visit_Date)) ############################################ # Visit_Date Syndrome Number_of_Visits # 1 2013-02-01 ASTHMA 49 # 2 2013-02-01 ASTHMA & ILI 2 # 3 2013-02-01 ILI 5 # 4 2013-02-02 ASTHMA 60 # 5 2013-02-02 ILI 21 # 6 2013-02-03 ASTHMA 89 ############################################
Visits Per Day by Syndrome ggplot(events.per.day.split) + aes(Visit_Date, Number_of_Visits, color=Syndrome) + geom_line()
Create Summary Data Set events.per.day <- ddply(df, .(Visit_Date), summarize, Number_of_Visits=length(Visit_Date)) ############################### # Visit_Date Number_of_Visits # 1 2013-02-01 513 # 2 2013-02-02 396 # 3 2013-02-03 428 # 4 2013-02-04 409 # 5 2013-02-05 580 # 6 2013-02-06 391 ###############################
Visits Per Day by Syndrome ggplot(events.per.day) + aes(Visit_Date, Number_of_Visits) + geom_line()
Example: New Syndrome • We want to see create a new syndrome to identify visits during which the patient had cough AND dizziness AND headache. • Data source: Texas region 2/3 • Location: Tarrant County • Time: February 1–October 31, 2013
Query SELECT Facility_City, Facility_State, Diagnosis_Code, Diagnosis_Text, Chief_Complaint, Age, Gender, Visit_Date_Time, Row_Number FROM TC_Meaningful_Use_Base WHERE Visit_Date_Time BETWEEN '2013-02-01 00:00:00' AND '2013-10-31 23:59:59' AND (Diagnosis_Code LIKE '%786.2%' OR Diagnosis_Code LIKE '%7862%') AND (Diagnosis_Code LIKE '%780.4%' OR Diagnosis_Code LIKE '%7804%') AND (Diagnosis_Code LIKE '%784.0%' OR Diagnosis_Code LIKE '%7840%');
Run Query # Run Query df.sick <- dbGetQuery(con, query) # Fix dates using lubridate df.sick$Visit_Date_Time <- ymd_hms(df.sick$Visit_Date_Time) # Create a month column df.sick$Month <- month(df.sick$Visit_Date, label=T)
Run Query ggplot(df.sick) + aes(Month) + geom_histogram()
Create Line Listing write.csv(df.sick, 'sick.csv', quote=F, row.names=F) # sick.csv: # # Row_Number,Facility_City,Facility_State,Diagnosis_Code,D... # 1374852,Houston,TX,473.9:780.4:300.00:786.2:784.0:305.1:... # 1536525,Houston,TX,486:786.2:780.4:784.0:794.00:789.00:7... # 2100347,Rowlett,TX,780.4:784.0:786.2,NA,SCREENING - HA -... # 2189305,Rowlett,TX,780.4:784.0:786.2:V76.12,NA,SCREENING... # 3108090,Rowlett,TX,780.4:784.0:786.2:V76.12,NA,SCREENING... # 5887191,Rowlett,TX,786.2:780.1:780.4:784.0,NA,786.2:SEP:... # 7968958,Houston,TX,493.90:780.4:780.60:784.0:786.2:787.0... # 9197758,Houston,TX,493.90:780.4:780.60:784.0:786.2:787.0...
Example: Refined Age Groups • We want to see motor vehicle traffic accidents involving young people. We recombine the ages to the following groups: 0–15, 16–20, 21–25, 26–30, and 31–35 years. • Data source: Texas region 2/3 • Location: Tarrant County • Time: February 1–October 31, 2013
Query SELECT Facility_City, Facility_State, Diagnosis_Code, Diagnosis_Text, Chief_Complaint, Age, Gender, Visit_Date_Time, Row_Number FROM TC_Meaningful_Use_Base WHERE Visit_Date_Time BETWEEN '2013-02-01 00:00:00' AND '2013-10-31 23:59:59' AND (Diagnosis_Code LIKE '%E81_%') AND Age <= 35;
Run Query # Run Query df.auto <- dbGetQuery(con, query) # Fix dates using lubridate df.auto$Visit_Date_Time <- ymd_hms(df.auto$Visit_Date_Time) # Create a date column df.auto$Visit_Date <- as.Date(df.auto$Visit_Date_Time)
Bin Ages # Drop all rows where age is greater than 35 years or is undefined df.auto <- df.auto[!is.na(df.auto$Age),] df.auto <- df.auto[df.auto$Age <= 35,] # Bin ages df.auto$Age_binned <- cut(df.auto$Age, breaks=c(0, 15, 20, 25, 30, 35), include.lowest=T)
Histogram of Visits by Age Group ggplot(df.auto) + aes(Age_binned) + geom_histogram()
Create Summary Data Set df.auto.daily.counts <- ddply(df.auto, .(Visit_Date, Age_binned), summarize, count=length(Chief_Complaint)) ################################ # Visit_Date Age_binned count # 1 2013-02-01 [0,15] 3 # 2 2013-02-01 (15,20] 25 # 3 2013-02-01 (20,25] 16 # 4 2013-02-01 (25,30] 16 # 5 2013-02-02 [0,15] 13 # 6 2013-02-02 (15,20] 6 ###############################
Visits per Week by Age ggplot(df.auto.daily.counts) + aes(x = Visit_Date, y = count, color=Age_binned) + geom_line(size=2, alpha=.7)
Visits per Week by Age ggplot(df.auto.daily.counts) + aes(x = Visit_Date, y = count, color=Age_binned) + geom_smooth(size=3, alpha=.7)