Quick database evaluation
library(readxl)
library(tidyverse)
library(wrapr)
library(lubridate)
"""
Check columns and date time formats. Otherwise it will not work.
Also check import (na = '.', skiprows etc)
Required columns and naming:
- ID
- DV
- TAD
- AMT
- MDV
- EVID
- Date (format='%m/%d/%y')
- TIME (format='%H:%M:%S')
colist = list of covariates to test matching the covariate column names within the dataset
"""
# SETTINGS:
colist = c('GNDR','HT','WT','AGE','ALB','CRP')
date_present = 'y' # 'y' or 'n' calendar dates
clock_times = 'y' # 'y' or 'n' clock times as in H:m:S
stats = 'y' # 'y' or 'n' print stats of columns
# Read data and format
db <- read_excel("test.xlsx",na = ".")
if(clock_times == 'y'){
db <- db %>% mutate(TIME = format(TIME,'%H:%M:%S'))
}
# Create doc en start writing
zz <- file("LOG.txt","w")
writeLines('-----------------------------------------------------',con=zz,sep="\n")
writeLines("LOG FILE: CHECKING DATABASE FOR MISSING VALUES",con=zz,sep="\n")
writeLines("SDTSassen v23.06.20",con=zz,sep="\n")
writeLines('-----------------------------------------------------',con=zz,sep="\n")
writeLines("",con=zz,sep="\n")
if(date_present == 'y'){
CMT.count = 0
writeLines("CMT missing:",con=zz,sep="\n")
for(i in 1:nrow(db)){
if(is.na(db$CMT[i])){
err = paste('-',db$ID[i],db$DATE[i], db$TIME[i])
writeLines(err,con=zz,sep="\n")
CMT.count =+ 1
}
}
if(CMT.count == 0){writeLines("-None-",con=zz,sep="\n")}
writeLines('',con=zz,sep="\n")
writeLines("DV missing (MDV=0):",con=zz,sep="\n")
DV.count = 0
for(i in 1:nrow(db)){
if(is.na(db$DV[i]) & db$MDV[i]==0 & !is.na(db$MDV[i])){
err = paste('-',db$ID[i],db$DATE[i], db$TIME[i])
writeLines(err,con=zz,sep="\n")
DV.count =+ 1
}
}
if(DV.count == 0){writeLines("-None-",con=zz,sep="\n")}
writeLines('',con=zz,sep="\n")
writeLines("AMT missing (EVID=1 or 4):",con=zz,sep="\n")
AMT.count = 0
for(i in 1:nrow(db)){
if(is.na(db$AMT[i]) & (db$EVID[i]==1 | db$EVID[i]==4) & !is.na(db$EVID[i])){
err = paste('-',db$ID[i],db$DATE[i], db$TIME[i])
writeLines(err,con=zz,sep="\n")
AMT.count =+ 1
}
}
if(AMT.count == 0){writeLines("-None-",con=zz,sep="\n")}
writeLines('',con=zz,sep="\n")
writeLines("RATE missing (EVID=1 or 4):",con=zz,sep="\n")
RATE.count = 0
for(i in 1:nrow(db)){
if(is.na(db$RATE[i]) & (db$EVID[i]==1 | db$EVID[i]==4) & !is.na(db$EVID[i])){
err = paste('-',db$ID[i],db$DATE[i], db$TIME[i])
writeLines(err,con=zz,sep="\n")
RATE.count =+ 1
}
}
if(RATE.count == 0){writeLines("-None-",con=zz,sep="\n")}
EVID.count = 0
writeLines('',con=zz,sep="\n")
writeLines("EVID missing:",con=zz,sep="\n")
for(i in 1:nrow(db)){
if(is.na(db$EVID[i])){
err = paste('-',db$ID[i],db$DATE[i], db$TIME[i])
writeLines(err,con=zz,sep="\n")
EVID.count =+ 1
}
}
if(EVID.count == 0){writeLines("-None-",con=zz,sep="\n")}
writeLines('',con=zz,sep="\n")
writeLines("ID missing:",con=zz,sep="\n")
ID.count = 0
for(i in 1:nrow(db)){
if(is.na(db$ID[i])){
err = paste('-',db$ID[i],db$DATE[i], db$TIME[i])
writeLines(err,con=zz,sep="\n")
ID.count =+ 1
}
}
if(ID.count == 0){writeLines("-None-",con=zz,sep="\n")}
writeLines('',con=zz,sep="\n")
writeLines("CMT > 3:",con=zz,sep="\n")
CMT2.count = 0
for(i in 1:nrow(db)){
if(!is.na(db$CMT[i]) & db$CMT[i] >3){
err = paste('-',db$ID[i],db$DATE[i], db$TIME[i], db$CMT[i])
writeLines(err,con=zz,sep="\n")
CMT2.count =+ 1
}
}
if(CMT2.count == 0){writeLines("-None-",con=zz,sep="\n")}
writeLines('',con=zz,sep="\n")
writeLines("DATE missing:",con=zz,sep="\n")
DATE.count = 0
for(i in 1:nrow(db)){
if(is.na(db$DATE[i])){
err = paste('-',db$ID[i],db$DATE[i], db$TIME[i])
writeLines(err,con=zz,sep="\n")
DATE.count =+ 1
}
}
if(DATE.count == 0){writeLines("-None-",con=zz,sep="\n")}
writeLines('',con=zz,sep="\n")
writeLines("TIME missing:",con=zz,sep="\n")
TIME.count = 0
for(i in 1:nrow(db)){
if(is.na(db$TIME[i])){
err = paste('-',db$ID[i],db$DATE[i], db$TIME[i])
writeLines(err,con=zz,sep="\n")
TIME.count =+ 1
}
}
if(TIME.count == 0){writeLines("-None-",con=zz,sep="\n")}
}
if(date_present == 'n'){
CMT.count = 0
writeLines("CMT missing:",con=zz,sep="\n")
for(i in 1:nrow(db)){
if(is.na(db$CMT[i])){
err = paste('-',db$ID[i], db$TIME[i])
writeLines(err,con=zz,sep="\n")
CMT.count =+ 1
}
}
if(CMT.count == 0){writeLines("-None-",con=zz,sep="\n")}
writeLines('',con=zz,sep="\n")
writeLines("DV missing (MDV=0):",con=zz,sep="\n")
DV.count = 0
for(i in 1:nrow(db)){
if(is.na(db$DV[i]) & db$MDV[i]==0 & !is.na(db$MDV[i])){
err = paste('-',db$ID[i], db$TIME[i])
writeLines(err,con=zz,sep="\n")
DV.count =+ 1
}
}
if(DV.count == 0){writeLines("-None-",con=zz,sep="\n")}
writeLines('',con=zz,sep="\n")
writeLines("AMT missing (EVID=1 or 4):",con=zz,sep="\n")
AMT.count = 0
for(i in 1:nrow(db)){
if(is.na(db$AMT[i]) & (db$EVID[i]==1 | db$EVID[i]==4) & !is.na(db$EVID[i])){
err = paste('-',db$ID[i], db$TIME[i])
writeLines(err,con=zz,sep="\n")
AMT.count =+ 1
}
}
if(AMT.count == 0){writeLines("-None-",con=zz,sep="\n")}
writeLines('',con=zz,sep="\n")
writeLines("RATE missing (EVID=1 or 4):",con=zz,sep="\n")
RATE.count = 0
for(i in 1:nrow(db)){
if(is.na(db$RATE[i]) & (db$EVID[i]==1 | db$EVID[i]==4) & !is.na(db$EVID[i])){
err = paste('-',db$ID[i], db$TIME[i])
writeLines(err,con=zz,sep="\n")
RATE.count =+ 1
}
}
if(RATE.count == 0){writeLines("-None-",con=zz,sep="\n")}
EVID.count = 0
writeLines('',con=zz,sep="\n")
writeLines("EVID missing:",con=zz,sep="\n")
for(i in 1:nrow(db)){
if(is.na(db$EVID[i])){
err = paste('-',db$ID[i], db$TIME[i])
writeLines(err,con=zz,sep="\n")
EVID.count =+ 1
}
}
if(EVID.count == 0){writeLines("-None-",con=zz,sep="\n")}
writeLines('',con=zz,sep="\n")
writeLines("ID missing:",con=zz,sep="\n")
ID.count = 0
for(i in 1:nrow(db)){
if(is.na(db$ID[i])){
err = paste('-',db$ID[i], db$TIME[i])
writeLines(err,con=zz,sep="\n")
ID.count =+ 1
}
}
if(ID.count == 0){writeLines("-None-",con=zz,sep="\n")}
writeLines('',con=zz,sep="\n")
writeLines("CMT > 3:",con=zz,sep="\n")
CMT2.count = 0
for(i in 1:nrow(db)){
if(!is.na(db$CMT[i]) & db$CMT[i] >3){
err = paste('-',db$ID[i], db$TIME[i], db$CMT[i])
writeLines(err,con=zz,sep="\n")
CMT2.count =+ 1
}
}
if(CMT2.count == 0){writeLines("-None-",con=zz,sep="\n")}
writeLines('',con=zz,sep="\n")
writeLines("TIME missing:",con=zz,sep="\n")
TIME.count = 0
for(i in 1:nrow(db)){
if(is.na(db$TIME[i])){
err = paste('-',db$ID[i], db$TIME[i])
writeLines(err,con=zz,sep="\n")
TIME.count =+ 1
}
}
if(TIME.count == 0){writeLines("-None-",con=zz,sep="\n")}
}
# Check chronological order
if(date_present == 'y'){
db <- db %>% mutate(DATE = format(DATE,format='%m/%d/%y'))
db$DATE2 <- mdy(db$DATE)
dbdt <- db %>% mutate(TIME = format(TIME, format='%H:%M:%S')) %>% filter(!is.na(TIME))
dbdt$TIME2 <- hms(dbdt$TIME)
dbdt$DTTM <- dbdt$DATE2 + dbdt$TIME2
dbdt <- dbdt %>% filter(!is.na(DTTM))
writeLines('',con=zz,sep="\n")
writeLines("Check chronological order:",con=zz,sep="\n")
for(i in 2:nrow(dbdt)){
if(dbdt$ID[i] == dbdt$ID[i-1]){
if(dbdt$DTTM[i] < dbdt$DTTM[i-1]){
err = paste('-',db$ID[i-1],db$DATE[i-1], db$TIME[i-1],'gevolgd door', db$ID[i],db$DATE[i], db$TIME[i])
writeLines(err,con=zz,sep="\n")
}
}
}
}
# Other values
writeLines('',con=zz,sep="\n")
writeLines('',con=zz,sep="\n")
writeLines('',con=zz,sep="\n")
writeLines('-----------------------------------------------------',con=zz,sep="\n")
writeLines('Checking other selected columns for missing values',con=zz,sep="\n")
writeLines('-----------------------------------------------------',con=zz,sep="\n")
if(date_present == 'y'){
for(a in colist){
writeLines('',con=zz,sep="\n")
writeLines(a,con=zz,sep="\n")
loop.count = 0
for(i in 1:nrow(db)){
if(is.na(db[i,a][])){
err = paste('-',db$ID[i],db$DATE[i], db$TIME[i])
writeLines(err,con=zz,sep="\n")
loop.count =+ 1
}
}
if(loop.count == 0){writeLines("-None-",con=zz,sep="\n")}
}
} else {
for(a in colist){
writeLines('',con=zz,sep="\n")
writeLines(a,con=zz,sep="\n")
loop.count = 0
for(i in 1:nrow(db)){
if(is.na(db[i,a][])){
err = paste('-',db$ID[i],db$TIME[i])
writeLines(err,con=zz,sep="\n")
loop.count =+ 1
}
}
if(loop.count == 0){writeLines("-None-",con=zz,sep="\n")}
}
}
# Iterate over columns and summarize quick statistics
writeLines('',con=zz,sep="\n")
writeLines('-----------------------------------------------------',con=zz,sep="\n")
writeLines('Brief raw summary of columns',con=zz,sep="\n")
writeLines('-----------------------------------------------------',con=zz,sep="\n")
if(stats == 'y'){
for(i in 1:ncol(db)){
if(!is.character(db[[i]])){
nm <- colnames(db[i])
summ <- summary(db[,i])
writeLines(nm,con=zz,sep="\n")
writeLines(summ,con=zz,sep="\n")
writeLines('',con=zz,sep="\n")
}
}
} else {
writeLines('',con=zz,sep="\n")
writeLines('Print stats was set to no',con=zz,sep="\n")
}
close(zz)
# rm(AMT.count, CMT.count, CMT2.count, EVID.count, DV.count, ID.count,
# loop.count, RATE.count, summ, i, DATE.count, a, colist, err, TIME.count, zz, dbdt)
Goodness of fit plots (GoF)*
require(gridExtra)
require(lattice)
require(xpose4)
"""
# Make sure you are in the right working directory containing the tables and check run nummer
# To change 'idv' use: change.xvardef(newdb,var='idv') <- 'TAD'
# To stratify use: dv.vs.pred(newdb, by='GENDER',...)
# To take a subset use: dv.vs.pred(newdb, subset='BLOCK==1',...)
"""
new.runno <- '085'
newdb <- xpose.data(new.runno)
basic.gof(newdb)
#-------- Seperate graphs -------------#
# dv_vs_pred:
dv.vs.pred(newdb, col="black", smooth = T, pch=1, type ="p",
cex=1.2, xlb=list("Population predictions"), ylb=list("Observations"), main = NULL)
# dv_vs_ipred:
dv.vs.ipred(newdb, col="black", smooth = T, pch=1, type ="p",
cex=1.2, xlb=list("Individual predictions"), ylb=list("Observations"), main = NULL)
# |IWRES|:
absval.iwres.vs.idv(newdb, col="black", smooth = T, pch=1, type ="p",
cex=1.2, xlb=list("Time after dose (h)"), ylb=list("|IWRES|"), main = NULL)
# CWRES:
cwres.vs.idv(newdb, col="black", smooth = T, pch=1, type ="p",
cex=1.2, xlb=list("Time after dose (h)"), ylb=list("Conditional weighted residuals"), main = NULL)
#-------- Combine graphs into one figure -------------#
plot1 <- dv.vs.pred(newdb, col="black", smooth = T, pch=1, type ="p",
cex=1.2, xlb=list("Population predictions"), ylb=list("Observations"), main = NULL)
plot2 <- dv.vs.ipred(newdb, col="black", smooth = T, pch=1, type ="p",
cex=1.2, xlb=list("Individual predictions"), ylb=list("Observations"), main = NULL)
plot3 <- absval.iwres.vs.idv(newdb, col="black", smooth = T, pch=1, type ="p",
cex=1.2, xlb=list("Time after dose (h)"), ylb=list("|IWRES|"), main = NULL)
plot4 <- cwres.vs.idv(newdb, col="black", smooth = T, pch=1, type ="p",
cex=1.2, xlb=list("Time after dose (h)"), ylb=list("Conditional weighted residuals"), main = NULL)
grid.arrange(plot1, plot2, plot3, plot4, ncol=2, nrow=2)
# For log scale add: ,scales = list(y = list(log = 10), x = list(log=10))
# For limits add: ylim=c(-3,3)