Read in data files

Takes in both the long and wide versions of raw data.

# Add files to the correct section with the "nl_" or "nw_" prefixes in the object name. 
### Original data versions (org_) #####
#Base folder for all raw kinematic exports; defined once so the long absolute
#path is not repeated in every read.csv() call below.
raw_data_dir <- "~/Library/CloudStorage/Box-Box/CLA RSS Data Sharing/1022295_McGrattan_SMA_KINEMATICS/SMA_Data/raw_data"

#imports the data with header
orig_SMA_DAT <- read.csv(file = file.path(raw_data_dir, "SMA_N30_Single_v_All_Analysis_9_01_22_varnameupdate.csv"), header = TRUE)
# #we'll only be taking participants 6 and 7 from the following data set
orig_SMA_DAT_suppl <- read.csv(file = file.path(raw_data_dir, "BATCH1&2_SMA_EXPORT_20221206_varnameupdate.csv"), header = TRUE)


### Newer long version of data (non-updated program; nl_) ####
nl_SMA_DAT_b2 <- read.csv(file.path(raw_data_dir, "Cleaned_Spoden_Batch2_AugSept.csv"))
#NOTE: the next file name really does contain a space before ".csv"
nl_SMA_DAT_b3 <- read.csv(file.path(raw_data_dir, "Cleaned_OctNov_Batch_Spoden .csv"))
#this one had "complete" for first and last name - replace both with the MRN
nl_SMA_DAT_b3$firstname <- nl_SMA_DAT_b3$medicalrecordnumber
nl_SMA_DAT_b3$lastname <- nl_SMA_DAT_b3$medicalrecordnumber
  
nl_SMA_DAT_b4 <- read.csv(file.path(raw_data_dir, "Cleaned_Spoden_June_Batch_SMA.csv"))
nl_SMA_DAT_b5 <- read.csv(file.path(raw_data_dir, "Cleaned_Klein_1.csv"))
nl_SMA_DAT_b6 <- read.csv(file.path(raw_data_dir, "Cleaned Batch 3 Swallowtail Ratings Dembroski.csv"))
nl_SMA_DAT_b7 <- read.csv(file.path(raw_data_dir, "Cleaned Batch 2 Swallowtail Ratings Dembroski.csv"))
nl_SMA_DAT_b8 <- read.csv(file.path(raw_data_dir, "Dembroski_1_cleaned.csv"))
#Add new items here, with nl_dataobjectname <- read.csv(file.path(raw_data_dir, "FILENAME.csv"))
nl_SMA_DAT_b9 <- read.csv(file.path(raw_data_dir, "Cleaned_Batch 4 Swallowtail Ratings Dembroski.csv"))
nl_SMA_DAT_b10 <- read.csv(file.path(raw_data_dir, "spodenbatch5_cleaned.csv"))
nl_SMA_DATb11 <- read.csv(file.path(raw_data_dir, "Cleaned_BATCH1&2_SMA_EXPORT_Spoden.csv"))
#NOTE(review): b12 reads the SAME file as nl_SMA_DAT_b2, so those rows come in
#twice; the exact duplicates are removed later via the duplicated() filter --
#confirm whether b12 is still needed
nl_SMA_DATb12 <- read.csv(file.path(raw_data_dir, "Cleaned_Spoden_Batch2_AugSept.csv"))
nl_SMA_DATb13 <- read.csv(file.path(raw_data_dir, "SPODEN_SMA_251_233_REDO.csv"))


### Newer wide version of data (updated program; nw_) ####
#Base folders, defined once to avoid repeating the long absolute paths.
kinematics_dir <- "~/Library/CloudStorage/Box-Box/CLA RSS Data Sharing/1022295_McGrattan_SMA_KINEMATICS"
raw_data_dir <- file.path(kinematics_dir, "SMA_Data", "raw_data")

nw_SMA_DAT_zb1 <- read.csv(file.path(raw_data_dir, "Cleaned_Zoeller_1.csv"))
nw_SMA_DAT_zb2 <- read.csv(file.path(raw_data_dir, "Cleaned_Turski_Batch_2.csv"))
nw_SMA_DAT_zb3 <- read.csv(file.path(raw_data_dir, "Cleaned_Turski_Batch_3.csv"))
nw_SMA_DAT_zb4 <- read.csv(file.path(raw_data_dir, "Cleaned_Turski_Batch_4.csv"))
nw_SMA_DAT_zb5 <- read.csv(file.path(raw_data_dir, "Cleaned_Klein_2.csv"))
nw_SMA_DAT_zb6 <- read.csv(file.path(raw_data_dir, "Cleaned_BATCH_1_Export 9-4-23 Ochura.csv"))
nw_SMA_DAT_zb7 <- read.csv(file.path(raw_data_dir, "Cleaned_Turski_Batch_5.csv"))
nw_SMA_DAT_zb8 <- read.csv(file.path(raw_data_dir, "CleanedBatch_4_Nov Ochura J.csv"))
nw_SMA_DAT_zb9 <- read.csv(file.path(raw_data_dir, "CleanedOchuraBATCH_2.csv"))
nw_SMA_DAT_zb10 <- read.csv(file.path(raw_data_dir, "CleanedOchuraBatch 3.csv")) #removed the last few blank lines from this one
#Add new items here, with nw_dataobjectname <- read.csv(file.path(raw_data_dir, "FILENAME.csv"))
nw_SMA_DAT_zb11 <- read.csv(file.path(raw_data_dir, "Cleaned_IW_SMA_RELIABILITY2.csv"))
nw_SMA_DAT_zb12 <- read.csv(file.path(raw_data_dir, "Cleaned_IWilson-Batch1.csv"))
nw_SMA_DAT_zb13 <- read.csv(file.path(raw_data_dir, "Cleaned_McGhee_Batch1.csv"))
nw_SMA_DAT_zb14 <- read.csv(file.path(raw_data_dir, "Cleaned_Turski_7.csv"))
nw_SMA_DAT_zb15 <- read.csv(file.path(raw_data_dir, "Cleaned_Turski_Batch_6.csv"))
nw_SMA_DAT_zb16 <- read.csv(file.path(raw_data_dir, "McGhee_Batch2_Cleaned.csv"))
nw_SMA_DAT_zb17 <- read.csv(file.path(raw_data_dir, "Wilson_Batch3_Cleaned.csv"))
nw_SMA_DAT_zb18 <- read.csv(file.path(raw_data_dir, "WilsonCleaned_SMA_batch_2.csv"))
nw_SMA_DAT_zb19 <- read.csv(file.path(raw_data_dir, "IW_SMABatch4 (1) (1).csv"))
nw_SMA_DAT_zb20 <- read.csv(file.path(raw_data_dir, "IW_BATCH1ReDo_cleaned (1).csv"))
nw_SMA_DAT_zb21 <- read.csv(file.path(raw_data_dir, "AS_SMA_29.csv"))
nw_SMA_DAT_zb22 <- read.csv(file.path(raw_data_dir, "IW_SMA315_320.csv"))
nw_SMA_DAT_zb23 <- read.csv(file.path(raw_data_dir, "HM_July2025.csv"))
nw_SMA_DAT_zb24 <- read.csv(file.path(raw_data_dir, "ChanSMA_325_331.csv"))
nw_SMA_DAT_zb25 <- read.csv(file.path(raw_data_dir, "spodenSMA_324_331_323_322_321.csv"))
nw_SMA_DAT_zb26 <- read.csv(file.path(raw_data_dir, "spodenSMAextra_Cleaned.csv"))

#read in file name/object pairing (above)
filenamekey <- read.csv(file.path(kinematics_dir, "SMA_kinematics_filenames.csv"))

### SMA chart data #####
#Chart-review inputs; the CSV reads below are retained for reference (commented out).
# chartdata <- read.csv(file="~/Library/CloudStorage/Box-Box/CLA RSS Data Sharing/577565_McGrattan_SMA/BABYVFSSIMP/data/SMA_clean_chartReview_data_20250623.csv")
# analysisdata <- read.csv(file="~/Library/CloudStorage/Box-Box/CLA RSS Data Sharing/577565_McGrattan_SMA/BABYVFSSIMP/data/SMA_clean_analysis_data_20250623.csv")
# fois_sec <- read.csv(file="~/Library/CloudStorage/Box-Box/CLA RSS Data Sharing/577565_McGrattan_SMA_BabyVFSSImP/BABYVFSSIMP/data/SMA_fois_sec_combined.csv") 
#load prepared analysis objects into the workspace -- presumably
#analysis_included and analysis_included_lastexam used later in this script
#(TODO confirm the full object list in the .Rdata file)
load("~/Library/CloudStorage/Box-Box/CLA RSS Data Sharing/577565_McGrattan_SMA/BABYVFSSIMP/data/SMA_analysis_files.Rdata")

#log to connect subject id to exam #
idlog <- read.csv("~/Library/CloudStorage/Box-Box/CLA RSS Data Sharing/577565_McGrattan_SMA/BABYVFSSIMP/data/Clean_ID_log.csv")

Clean data to match original format

Long data -> original format

Wide data -> original format

### nw --> orig ####
#get list of new wide datasets; the pattern is anchored so only objects whose
#names START with "nw_" are converted (an unanchored "nw_" could also match
#other objects that merely contain the substring)
nwtoconvert <- grep("^nw_", ls(), value = TRUE)

#reference table pairing each Measurement_name with its graph type, built from
#one of the long-format datasets; it is loop-invariant, so compute it once
#rather than on every iteration
graph_ref <- nl_SMA_DAT_b2 %>% 
  group_by(graph, Measurement_name) %>% 
  tally()

#for each wide dataset: pivot to long, split value/unit, harmonize measurement
#names, and overwrite the object with a long version matching orig_SMA_DAT
for (nw in nwtoconvert) {
  temp <- get(nw)
  
  #merge into long version
  temp <- temp %>% 
    mutate(across(.cols = B1:BCR, .fns = as.character)) %>% 
    pivot_longer(cols = B1:BCR,
                 names_to = "Measurement_name", 
                 values_to = "value") %>% 
    #add units to new column and remove from value
    #(checked in this order: any "s" -> seconds, else "cm" -> centimeters)
    mutate(unit = case_when(grepl("s", value) ~ "s", 
                            grepl("cm", value) ~ "cm", 
                            TRUE ~ ""), 
           value = gsub("s|cm", "", value))
  
  #update measurement names to match the long-format spelling
  temp$Measurement_name[which(temp$Measurement_name == "Airwaycl")] <- "AIRWAYcl"
  temp$Measurement_name[which(temp$Measurement_name == "BP1.Aecl")] <- "BP1AEcl"
  temp$Measurement_name[which(temp$Measurement_name == "PCL")] <- "Pcl"
  
  #PESm arrives as two columns (seconds and displacement); force their units
  #explicitly, then collapse both names to a single "PESm"
  temp$unit[which(temp$Measurement_name == "PESm.sec.")] <- "s"
  temp$unit[which(temp$Measurement_name == "PESm.disp.")] <- "cm"
  temp$Measurement_name[(grepl("PESm", temp$Measurement_name))] <- "PESm"
  
  #add in graph variable based on matching measurements
  temp$graph <- graph_ref$graph[match(temp$Measurement_name, graph_ref$Measurement_name)]
  
  #select relevant columns
  temp <- subset(temp, select = c("MRN", "Assessor", "Assessment.Date", "Pt..Name", "DOB", "name", "graph", "Measurement_name", "value", "unit"))
  
  #make date of assessment and performed-by NA (will be best and worst swallow)
  temp$Assessment.Date <- NA
  temp$Pt..Name <- NA
  temp$DOB <- NA
  temp$norm <- NA
  temp$sd <- NA
  
  #rename POSITIONALLY to match the original data's columns -- assumes temp now
  #has exactly the same number and order of columns as orig_SMA_DAT
  names(temp) <- names(orig_SMA_DAT)
  
  #save back to object
  assign(nw, temp)
  
}



#get current date as YYYYMMDD (no separators) to use in file names when
#saving data in lines below
curdat <- format(Sys.Date(), "%Y%m%d") #e.g. "20250613"

COMBINE DATASETS

## pull participants 6 & 7 from SMA_DAT_suppl 
orig_SMA_DAT_suppl67only <- orig_SMA_DAT_suppl[which(orig_SMA_DAT_suppl$Participant_Num %in% c(6,7)),]

## check 
#View(orig_SMA_DAT_suppl67only)

## get all datasets (only use newer data for now)
#mget() returns a named list of every nl_/nw_ object; the list names become
#the row-name prefixes after rbind(), which is how `datafile` is derived below
SMA_DAT_list <- mget(grep("nl_|nw_", ls(), value = TRUE))

#combine into one
SMA_DAT <- do.call("rbind", SMA_DAT_list)

#add variable with dataset
#rbind() row names look like "nl_SMA_DAT_b2.13"; strip everything from the
#first "." on. NOTE: the replacement is a single SPACE, so every datafile
#value carries a trailing space -- later comparisons (e.g. "nl_SMA_DAT_b10 ")
#and trimws() calls depend on this, so do not change it to "".
SMA_DAT$datafile <- gsub("\\..*", " ", row.names(SMA_DAT))

#add in variable that denotes dataset type ("nl" or "nw": the prefix before
#the first underscore of the row name)
SMA_DAT$DataSetType <- unlist(lapply(strsplit(row.names(SMA_DAT), "_"), function(x) x[1]))

#check if there are duplicates in the data (rows exactly duplicated)
summary(duplicated(SMA_DAT))
SMA_DAT$Isdup <- duplicated(SMA_DAT)

#remove exact duplicates (caused by reading in the data multiple times)
SMA_DAT1 <- filter(SMA_DAT, Isdup == FALSE)




#create table of types by graph to make sure all the data are right
#(one row per graph x measurement x unit combination; the tally count is
#dropped, leaving a wide check table with the graph value under each
#DataSetType column -- mismatches between nl and nw show up side by side)
SMA_measurementtypes <- SMA_DAT1 %>% 
  group_by(DataSetType, graph, Measurement_name, units) %>% 
  tally() %>% 
  select(-n) %>% 
  pivot_wider(names_from = DataSetType, 
              values_from = graph)

#eyeball the raw participant identifiers before cleaning them below
unique(SMA_DAT1$Participant_Num)

Clean participant numbers

#Replace spelled-out participant numbers with digits via a lookup table
#(exact matches only; the values stay character, as in the original column)
word_num_map <- c(
  "eight" = "8", "four" = "4", "fourteen" = "14", "thirty-one" = "31",
  "Two Eighteen" = "218", "Two Twenty Seven" = "227", "One Sixty Six" = "166",
  "One Fifty Nine" = "159", "Two Hundred Eight" = "208", "Two Nineteen" = "219",
  "Two Forty Nine" = "249", "One Fifty Seven" = "157", "One Fifty Two" = "152",
  "One Sixty Two" = "162", "Three thirty two" = "332",
  "Three Thirty Three" = "333", "Three Thirty Four" = "334",
  "Three Thirty Five" = "335", "Three Thirty Six" = "336",
  "Three Forty" = "340", "Three Thirty Nine" = "339",
  "Three Thirty Eight" = "338", "Three Twenty Six" = "326",
  "Three Forty Two" = "342"
)
hit <- match(SMA_DAT1$Participant_Num, names(word_num_map))
SMA_DAT1$Participant_Num[!is.na(hit)] <- word_num_map[hit[!is.na(hit)]]

#anything containing "121" belongs to participant 121
SMA_DAT1$Participant_Num[grepl("121", SMA_DAT1$Participant_Num)] <- "121"

#strip a "Swallow " prefix (e.g. "Swallow 12" -> "12")
swallow_rows <- grepl("Swallow", SMA_DAT1$Participant_Num)
SMA_DAT1$Participant_Num[swallow_rows] <- gsub("Swallow ", "", SMA_DAT1$Participant_Num[swallow_rows])

#strip every "p" (e.g. "p12" -> "12"); note this also turns any "Complete"
#entry into "Comlete", which is handled further below in the script
p_rows <- grepl("p", SMA_DAT1$Participant_Num)
SMA_DAT1$Participant_Num[p_rows] <- gsub("p", "", SMA_DAT1$Participant_Num[p_rows])

#drop the leading zero; kept AFTER the "p"-strip so "p03" is covered too
SMA_DAT1$Participant_Num[which(SMA_DAT1$Participant_Num == "03")] <- "3"

#"1a"/"1b" suffix variants belong to participant 1
SMA_DAT1$Participant_Num[grepl("1a|1b", SMA_DAT1$Participant_Num)] <- "1"

#strip any remaining "a" characters (e.g. "12a" -> "12")
a_rows <- grepl("a", SMA_DAT1$Participant_Num)
SMA_DAT1$Participant_Num[a_rows] <- gsub("a", "", SMA_DAT1$Participant_Num[a_rows])

#entries mentioning "video" belong to participant 325
SMA_DAT1$Participant_Num[grepl("video", SMA_DAT1$Participant_Num)] <- "325"

#fill missing numbers where the Participant column identifies participant 300
SMA_DAT1$Participant_Num[which(is.na(SMA_DAT1$Participant_Num) & SMA_DAT1$Participant == 300)] <- "300"


#review the cleaned identifiers
unique(SMA_DAT1$Participant_Num)

#which has a name?
#A few rows have a rater's name typed into Participant_Num;
#use the Participant column to recover the real number instead
SMA_DAT1 %>%
  filter(grepl("Wilson", Participant_Num)) %>%
  group_by(Participant) %>%
  slice_head(n=1) 

#substitute the Participant value for the "Irena Wilson" rows
SMA_DAT1$Participant_Num[which(SMA_DAT1$Participant_Num=="Irena Wilson")] <- SMA_DAT1$Participant[which(SMA_DAT1$Participant_Num=="Irena Wilson")]

#"Iren Wilson" is likely "Irena Wilson" after the earlier gsub("a", "") step;
#substitute the Participant value for those rows too
SMA_DAT1$Participant_Num[which(SMA_DAT1$Participant_Num=="Iren Wilson")] <- SMA_DAT1$Participant[which(SMA_DAT1$Participant_Num=="Iren Wilson")]


#which is "no swallows"
SMA_DAT1 %>%
  filter(Participant_Num == "no swallows") %>%
  group_by(Participant) %>%
  slice_head(n=1) #should be removed

#which is "no swllows" -- likely "no swallows" after the earlier
#gsub("a", "") step stripped the "a"
SMA_DAT1 %>%
  filter(Participant_Num == "no swllows") %>%
  group_by(Participant) %>%
  slice_head(n=1)

#substitute the Participant value for the "no swllows" rows
SMA_DAT1$Participant_Num[which(SMA_DAT1$Participant_Num=="no swllows")] <- SMA_DAT1$Participant[which(SMA_DAT1$Participant_Num=="no swllows")]


#which is "Comlete" -- likely "Complete" after the earlier gsub("p", "") step
#SMA_DAT1[which(SMA_DAT1$Participant_Num == "Comlete"),] #in participant
SMA_DAT1 %>%
  filter(Participant_Num == "Comlete") %>%
  group_by(Participant) %>%
  slice_head(n=1)

#substitute the Participant value for the "Comlete" rows
SMA_DAT1$Participant_Num[which(SMA_DAT1$Participant_Num=="Comlete")] <- SMA_DAT1$Participant[which(SMA_DAT1$Participant_Num=="Comlete")]


#what is the "" row?
SMA_DAT1[which(SMA_DAT1$Participant_Num == ""),] #substitute participant
SMA_DAT1$Participant_Num[which(SMA_DAT1$Participant_Num == "")] <- SMA_DAT1$Participant[which(SMA_DAT1$Participant_Num == "")]


#what is the NA row?
SMA_DAT1[which(is.na(SMA_DAT1$Participant_Num)),] #blank case, will remove

#recheck the identifiers after the substitutions above
unique(SMA_DAT1$Participant_Num)

Check if any values in “Participant” do not match “Participant_Num”

#check to see if Participant and Participant_Num don't match
#(one example row per mismatching pair; rows where the comparison is NA
#drop out of the filter)
SMA_DAT1 %>% 
  select(Participant, Participant_Num, datafile) %>% 
  mutate(match = ifelse(Participant == Participant_Num, 1, 0)) %>% 
  filter(match == 0) %>% 
  group_by(Participant, Participant_Num) %>% 
  slice_head(n=1) 

#Participant 107 should be 75 -- substitute the Participant column value
#for the rows labeled "107"
SMA_DAT1$Participant_Num[which(SMA_DAT1$Participant_Num=="107")] <- SMA_DAT1$Participant[which(SMA_DAT1$Participant_Num=="107")]



#drop rows that cannot be tied to a participant: missing, "no swallows", blank
SMA_DAT1 <- filter(SMA_DAT1, 
                   !is.na(Participant_Num), 
                   Participant_Num != "no swallows", 
                   Participant_Num != "")

# SMA_DAT1 %>% 
#   filter(Participant_Num %in% c(16, 23, 11)) %>% 
#   group_by(Participant_Num) %>% 
#   select(Participant_Num, Participant, datafile) %>% 
#   slice_head(n=1)

Check participant by file to make sure the correct data are included. Need to update data to keep newest for 288.

#Counts of rows per (datafile, participant), joined to the file-name key and
#flagged with whether the participant is in the last-exam analysis set.
#NOTE(review): the join key `datafile` carries a trailing space (from the
#gsub() that created it) -- assumes filenamekey$object matches that form;
#full_join keeps non-matching rows from both sides, so check for NA fan-out.
part_by_file <- SMA_DAT1 %>% 
  group_by(datafile, Participant_Num) %>% 
  tally() %>% 
  arrange(Participant_Num) %>% 
  full_join(filenamekey, by=c("datafile" = "object")) %>% 
  mutate(Included = ifelse(Participant_Num %in% analysis_included_lastexam$studyid_clean, 1, 0))

#write.csv(part_by_file, file="Summary_of_Kinematic_files.csv", row.names = F)

#reliability? participants appearing in more than one file were rated twice
dupids <- part_by_file$Participant_Num[which(duplicated(part_by_file$Participant_Num))]

#remove earlier data from participant 288
#NOTE: the previous `-which(...)` form would silently drop ALL rows if no row
#matched (x[-integer(0), ] selects zero rows), so build a logical mask and keep
#the complement instead. `%in% TRUE` also maps any NA comparisons to FALSE.
#The trailing space in "nl_SMA_DAT_b10 " is intentional: datafile values keep
#a trailing space from the gsub() replacement used when they were created.
drop_288 <- SMA_DAT1$Participant_Num == 288 & SMA_DAT1$datafile == "nl_SMA_DAT_b10 "
SMA_DAT1 <- SMA_DAT1[!(drop_288 %in% TRUE), ]

Add IDs and Merge chart data

Check matches using participant number

#check studyID matches
#(analysis_included is presumably loaded from SMA_analysis_files.Rdata above)
summary(SMA_DAT1$Participant_Num %in% analysis_included$studyid_clean)
# They do not all match... 

#IDs in the kinematic data but not in the analysis set
#(outer parentheses print the result in the knitted report)
(nomatch <- unique(SMA_DAT1$Participant_Num[which(SMA_DAT1$Participant_Num %in% analysis_included$studyid_clean == FALSE)]))

#which files are they from?
nomatch_dat <- SMA_DAT1[which(SMA_DAT1$Participant_Num %in% nomatch),]

nomatch_dat %>% 
  group_by(datafile, Participant_Num) %>% 
  tally()

#only 223 should be in there, rest are duplicates. 

#other way? analysis IDs with no kinematic rows
summary(analysis_included$studyid_clean %in% SMA_DAT1$Participant_Num)
#sorted list of analysis IDs absent from the kinematic data
unique(analysis_included$studyid_clean[which(analysis_included$studyid_clean %in% SMA_DAT1$Participant_Num == FALSE)])[order(unique(analysis_included$studyid_clean[which(analysis_included$studyid_clean %in% SMA_DAT1$Participant_Num == FALSE)]))]
#236 - this participant had no swallows 

Add subject id to exams using BabyVFSSimP id log (cleaned and written out in the BabyVFSSimP data cleaning script)

#add subject id to exams
#sanity-check that every Participant_Num appears in the ID log before matching
summary(SMA_DAT1$Participant_Num %in% idlog$studyid_clean)

#match() uses the FIRST idlog row per studyid_clean; unmatched IDs get NA
SMA_DAT1$Subject.ID <- idlog$Subject.ID[match(SMA_DAT1$Participant_Num, idlog$studyid_clean)]

Check participant inclusion

Check treated included

#flag each analysis row with whether its Subject.ID appears in the kinematic
#data; direct logical-to-numeric coercion (the ifelse(cond, 1, 0) wrapper was
#redundant -- %in% never returns NA, so the result is identical)
analysis_included$In_Kinematics <- as.numeric(analysis_included$Subject.ID %in% SMA_DAT1$Subject.ID)

#list treated participants with no kinematic data at all
#(outer parentheses print the result in the knitted report)
(t_notinclude <- analysis_included %>% 
    group_by(Subject.ID) %>% 
    summarize(Included = sum(In_Kinematics)) %>% 
    #find any not included
    filter(Included == 0))

Add group to the SMA data

#merge with treatment data
treatmentinfor_tomerge <- analysis_included %>% 
  select(studyid_clean, treat_group)

#merge datasets
#inner_join keeps only rows whose Participant_Num matches a studyid_clean;
#as.numeric() will warn and yield NA for any non-numeric IDs still present,
#and those NA rows are then dropped by the join
SMA_DAT2 <- SMA_DAT1 %>% 
  mutate(Participant_Num = as.numeric(Participant_Num)) %>% 
  inner_join(treatmentinfor_tomerge, by=c("Participant_Num" = "studyid_clean")) 
  
#participants per treatment group (sanity check)
SMA_DAT2 %>% 
  group_by(treat_group) %>% 
  summarize(n = n_distinct(Subject.ID))

#keep only exams in the last-exam analysis set
SMA_DAT2_LastExam <- SMA_DAT2 %>% 
  filter(Participant_Num %in% analysis_included_lastexam$studyid_clean)

Partition out reliability exams and keep only consensus in analysis

#spreadsheet annotating each (datafile, participant) pair with reliability and
#consensus information (read_excel is from readxl, loaded for this report)
part_by_file_up <- read_excel(path = "~/Library/CloudStorage/Box-Box/CLA RSS Data Sharing/1022295_McGrattan_SMA_KINEMATICS/McGrattan_SMA_Scripts/Summary_of_Kinematic_files_addinfo.xlsx")

#reliability exams among last-exam participants
relib_files <- part_by_file_up %>% 
  filter(Reliability == 1, Participant_Num %in% analysis_included_lastexam$studyid_clean)

#consensus exams ("Concensus" spelling matches the spreadsheet column name)
con_files <- part_by_file_up %>% 
  filter(Concensus == "x", Participant_Num %in% analysis_included_lastexam$studyid_clean)

#match on "participant + file"; trimws() drops the trailing space that the
#datafile values carry
SMA_reliability <- SMA_DAT2_LastExam %>% 
  filter(paste(Participant_Num, trimws(datafile)) %in% paste(relib_files$Participant_Num, relib_files$datafile))

SMA_consensus <- SMA_DAT2_LastExam %>% 
  filter(paste(Participant_Num, trimws(datafile)) %in% paste(con_files$Participant_Num, con_files$datafile))

#remove reliability and consensus cases, then add the consensus back in. 
#(FALSE spelled out rather than F, which is reassignable)
SMA_DAT2_LastExam1 <- SMA_DAT2_LastExam %>% 
  filter(Participant_Num %in% unique(c(SMA_consensus$Participant_Num, SMA_reliability$Participant_Num)) == FALSE) %>% 
  bind_rows(SMA_consensus)

DATA SUMMARY BY PARTICIPANT

Create data subsets with only the measures of interest. Using Last exam data for now.

#Making a sub dataset with only rows for the five measures of interest
#writing out all participants now
#(%in% replaces the long chain of == comparisons joined with |; same rows kept)
SMA_DAT_sub <- SMA_DAT2_LastExam1 %>%
  filter(Measurement_name %in% c("PCR", "OPT", "HPT", "TPT", "PESop")) %>% 
  mutate(value = as.numeric(value))

#Force Measurement_name to plain character (it was a factor in some inputs,
#which made the later grouping/case_when steps behave oddly)
SMA_DAT_sub$Measurement_name <- as.character(SMA_DAT_sub$Measurement_name)

#same subsetting for the reliability exams
#writing out all participants now
#(%in% replaces the long chain of == comparisons joined with |; same rows kept)
SMA_DAT_sub_relib <- SMA_reliability %>%
  filter(Measurement_name %in% c("PCR", "OPT", "HPT", "TPT", "PESop")) %>% 
  mutate(value = as.numeric(value))

#Force Measurement_name to plain character (it was a factor in some inputs,
#which made the later grouping/case_when steps behave oddly)
SMA_DAT_sub_relib$Measurement_name <- as.character(SMA_DAT_sub_relib$Measurement_name)

Summarize data within each participant: find the min, max, mean, and standard deviation and output the result into an object called SMA_DAT_summary. NAs are removed from the calculations.

This is grouped by SUBJECT ID for the last exam.

#Per-participant summary of each measure. Small()/Large() are DescTools
#helpers returning the k smallest/largest values, so Lowest_5p/Highest_5p are
#means of the bottom/top 5% of observations (fivep = ceiling of 5% of n).
#na.rm = TRUE spelled out (T is reassignable and best avoided).
SMA_DAT_summary <- SMA_DAT_sub %>%
  group_by(Subject.ID, Measurement_name, treat_group) %>%
  dplyr::summarise(Lowest_Every= min(value, na.rm = TRUE),
                   Highest_Every = max(value, na.rm = TRUE),
                   Mean = mean(value, na.rm = TRUE),
                   STD = sd(value, na.rm = TRUE),
                   n_obvs = n(), 
                   fivep = ceiling(n_obvs*0.05),
                   Lowest_5p = mean(Small(value, k=fivep), na.rm=TRUE),
                   Highest_5p = mean(Large(value, k=fivep), na.rm=TRUE),
                   n_visit = n_distinct(Participant_Num)) %>% #get total number of observations for each participant
  #create OI scores - Worst is highest of all except for PES which is lowest
  mutate(Worst_OI = case_when(Measurement_name == "PESop" ~ Lowest_Every, 
                              Measurement_name != "PESop" ~ Highest_Every), 
         Best_OI = case_when(Measurement_name == "PESop" ~ Highest_Every, 
                             Measurement_name != "PESop" ~ Lowest_Every), 
         Worst_OI_5p = case_when(Measurement_name == "PESop" ~ Lowest_5p, 
                              Measurement_name != "PESop" ~ Highest_5p), 
         Best_OI_5p = case_when(Measurement_name == "PESop" ~ Highest_5p, 
                             Measurement_name != "PESop" ~ Lowest_5p))
## `summarise()` has grouped output by 'Subject.ID',
## 'Measurement_name'. You can override using the `.groups`
## argument.
#replace -Inf and Inf (produced by min()/max() over all-NA groups) with NA
SMA_DAT_summary[SMA_DAT_summary == -Inf] <- NA
SMA_DAT_summary[SMA_DAT_summary == Inf] <- NA

#review table
#View(SMA_DAT_summary)

#for reliability: same summary, additionally grouped by datafile so each
#rater's file is summarized separately. na.rm = TRUE spelled out (T is
#reassignable and best avoided).
SMA_DAT_summary_relib <- SMA_DAT_sub_relib %>%
  group_by(Subject.ID, datafile, Measurement_name, treat_group) %>%
   dplyr::summarise(Lowest_Every= min(value, na.rm = TRUE),
                   Highest_Every = max(value, na.rm = TRUE),
                   Mean = mean(value, na.rm = TRUE),
                   STD = sd(value, na.rm = TRUE),
                   n_obvs = n(), 
                   fivep = ceiling(n_obvs*0.05),
                   Lowest_5p = mean(Small(value, k=fivep), na.rm=TRUE),
                   Highest_5p = mean(Large(value, k=fivep), na.rm=TRUE),
                   n_visit = n_distinct(Participant_Num)) %>% #get total number of observations for each participant
  #create OI scores - Worst is highest of all except for PES which is lowest
  mutate(Worst_OI = case_when(Measurement_name == "PESop" ~ Lowest_Every, 
                              Measurement_name != "PESop" ~ Highest_Every), 
         Best_OI = case_when(Measurement_name == "PESop" ~ Highest_Every, 
                             Measurement_name != "PESop" ~ Lowest_Every), 
         Worst_OI_5p = case_when(Measurement_name == "PESop" ~ Lowest_5p, 
                              Measurement_name != "PESop" ~ Highest_5p), 
         Best_OI_5p = case_when(Measurement_name == "PESop" ~ Highest_5p, 
                             Measurement_name != "PESop" ~ Lowest_5p))
## `summarise()` has grouped output by 'Subject.ID', 'datafile',
## 'Measurement_name'. You can override using the `.groups`
## argument.
#replace -Inf and Inf (produced by min()/max() over all-NA groups) with NA
SMA_DAT_summary_relib[SMA_DAT_summary_relib == -Inf] <- NA
SMA_DAT_summary_relib[SMA_DAT_summary_relib == Inf] <- NA

Review dimensions

#review dimensions; each participant should only have 5 measurements; LOOKS GOOD 
#(cross-tab of participant x measurement counts -- each cell should be 1)
table(SMA_DAT_summary$Subject.ID, SMA_DAT_summary$Measurement_name)

Export data

Export cleaned data to use for analysis.

# #export GENERAL SUMMARY  to csv (file showing min, max, std, n_observations for each participant every swallow analysis)
# write.csv(SMA_DAT_summary, paste0('../SMA_Data/SMA_DAT_summary_last',curdat,'.csv'), row.names=FALSE)
# 
# #write out the data file
# write.csv(SMA_DAT_sub, paste0('../SMA_Data/SMA_DAT_combined_last',curdat,'.csv'), row.names = FALSE)
# 
# #write out the reliability file
# write.csv(SMA_DAT_summary_relib, paste0('../SMA_Data/SMA_DAT_reliab_last',curdat,'.csv'), row.names = FALSE)

Session Information

#record R, platform, and package versions for reproducibility of this report
sessionInfo()
## R version 4.5.1 (2025-06-13)
## Platform: x86_64-apple-darwin20
## Running under: macOS Sonoma 14.4.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.5-x86_64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-x86_64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/Chicago
## tzcode source: internal
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
## [1] DescTools_0.99.60 knitr_1.50        vcd_1.4-13        readxl_1.4.5     
## [5] tidyr_1.3.1       ggplot2_3.5.2     dplyr_1.1.4      
## 
## loaded via a namespace (and not attached):
##  [1] sass_0.4.10        utf8_1.2.6         generics_0.1.4     class_7.3-23      
##  [5] lattice_0.22-7     hms_1.1.3          digest_0.6.37      magrittr_2.0.3    
##  [9] evaluate_1.0.4     RColorBrewer_1.1-3 mvtnorm_1.3-3      fastmap_1.2.0     
## [13] Matrix_1.7-3       cellranger_1.1.0   jsonlite_2.0.0     e1071_1.7-16      
## [17] httr_1.4.7         purrr_1.0.4        scales_1.4.0       jquerylib_0.1.4   
## [21] cli_3.6.5          expm_1.0-0         rlang_1.1.6        withr_3.0.2       
## [25] cachem_1.1.0       yaml_2.3.10        rootSolve_1.8.2.4  tools_4.5.1       
## [29] tzdb_0.5.0         lmom_3.2           gld_2.6.7          Exact_3.3         
## [33] colorspace_2.1-1   forcats_1.0.0      pacman_0.5.1       boot_1.3-31       
## [37] vctrs_0.6.5        R6_2.6.1           zoo_1.8-14         proxy_0.4-27      
## [41] lifecycle_1.0.4    fs_1.6.6           MASS_7.3-65        pkgconfig_2.0.3   
## [45] pillar_1.10.2      bslib_0.9.0        gtable_0.3.6       Rcpp_1.0.14       
## [49] glue_1.8.0         data.table_1.17.6  haven_2.5.5        xfun_0.52         
## [53] tibble_3.3.0       lmtest_0.9-40      tidyselect_1.2.1   rstudioapi_0.17.1 
## [57] farver_2.1.2       htmltools_0.5.8.1  rmarkdown_2.29     readr_2.1.5       
## [61] compiler_4.5.1