setwd( "/s-schijf/swarta/VCP")
#setwd("R:/Projecten/V050242 Decontaminatie/Jurgen/paper VCP-qmra met arno/Rcode")

library( foreign )  # For reading SPSS data
library( openxlsx )
library( reshape2 )
library( ggplot2 )
library( scales )
library( xtable )
library( stats ) # for rnultinom
library( dplyr )
library( stringr )
library( scales )

source( "likelihoods.R" )
source( "functions.R" )
source( "cat1.R" )
source( "cat2.R" )
source( "cat3.R" )
source( "cat4.R" )

options(warn=0)

nMCsamples <- 250
savePDF    <- T
saveExcel  <- T

# Read codebook and data
codebook <- read.codebook()
data     <- read.process.data( codebook )
weights  <- data[,77]
data     <- data[,-77]

# For debugging: turns warnings into errors
options(warn=2)

n.questions <- ncol( data )

codes <- colnames( data ) # Identifiers of question

master.table <- data.frame( code = character(0),
                            type = character(0),
                            question = character(0),
                            dist = character(0),
                            mean = character(0),
                            median = character(0),
                            p9 = character(0),
                            p99 = character(0)
)

storage.time <- data.frame( x=character(0),
                           ymin=numeric(0),
                           ymax=numeric(0),
                           lower=numeric(0),
                           middle=numeric(0),
                           upper=numeric(0))

consumption.frequency <- data.frame( x=character(0),
                        ymin=numeric(0),
                        ymax=numeric(0),
                        lower=numeric(0),
                        middle=numeric(0),
                        upper=numeric(0))

storage.location <- data.frame( x=character(0), freq=numeric(0), desc=character(0) )

raw.consumption.table <- data.frame( product = character(0),
                                     code1 = character(0),
                                     code2 = character(0),
                                     perc.category = numeric(0),
                                     mean = character(0),
                                     median = character(0),
                                     p9 = character(0),
                                     p99 = character(0)
)




pb <- txtProgressBar(min = 1, max = n.questions, initial = 1, char = "=", width = NA, style = 3, file = "")

for( j in 1:n.questions){
  
  setTxtProgressBar(pb, j )
  code    <- codes[j]
  index   <- which( codebook$questions$CODE==codes[j] ) # Index into codebook
  type    <- codebook$questions$answertype[ index ]       # Type 1,2,3 or 4
  table.label   <- as.character( codebook$questions$table.label[ index ] )
  figure.label  <- as.character( codebook$questions$figure.label[ index ] )
  
  # Get the meaning and labels of the answer options
  answers.value <- filter( codebook$answers, CODE==code ) %>% select( ANSWER,LOW, HIGH, VALUE ) %>% arrange( ANSWER)
  answers.label <- filter( codebook$answers, CODE==code ) %>% select( ANSWER, DESC) %>% na.omit() %>% droplevels() %>% arrange(ANSWER)
  
  if( (type != 4) & ( type != 0 )  ){ #Type 4 to be handled differently
    ind <- !is.na( data[j] )
    answers <- as.numeric( data[ind,j] )  # The answers given, one line per respondent
    w <- weights[ind]
    ind <- answers %in% answers.value$ANSWER
    answers <- answers[  ind ] # Answers that don't exist anymore, we delete
    w <- w[ ind ]
  }
  

  if( type==0){
    ####
    # Headings
    ####

    master.table <- rbind( master.table, 
                           data.frame( code = table.label,
                                       type = "",
                                       question = "",
                                       dist = "",
                                       mean = "",
                                       median = "",
                                       p9 = "",
                                       p99 = ""
                                       )
                          )
  }else if( type==1){
    ####
    # Relative fractions
    ####
    df <- cat1( answers, answers.label, w, savePDF=savePDF, saveExcel=saveExcel )
    
    df <- left_join( df, answers.label %>% mutate( DESC=as.factor(DESC)), by = c("x" = "DESC")) %>% 
      arrange( ANSWER ) %>% select(-ANSWER )
    
    dist <- paste(sprintf( "%0.2f,", df$phat ), sep="", collapse="" )
    dist <- substr( dist, start=1, stop=nchar(dist)-1)
    dist <- paste( "MN(", dist, ")", sep="",collapse="" )
    
    master.table.newrow <- data.frame( code=codes[j],
                                       type=as.character(type),
                                       question=table.label,
                                       dist=dist,
                                       mean="-",
                                       median="-",
                                       p9="-",
                                       p99="-"
                                      )
    
    master.table <- rbind( master.table, master.table.newrow ) 
  } else if( type==2){
    ####
    # Relative fractions
    ####
    
    # We switch the answer categories, in order to translate fromm 'how often do you...'
    # to 'how often do you not ...', for C9.1 to C9.5 -> j=40 to j=45
    if( j>=40 & j <= 44)
    {
      tmp <- answers.value[1, 2:4]
      answers.value[1, 2:4] <- answers.value[3, 2:4]
      answers.value[3, 2:4]<-tmp
    }
    
    result <- cat2( answers, answers.value, w, answers.label, nMCsamples, savePDF=savePDF, saveExcel=saveExcel  )
    
    new.row <- new.table.row( result, label=table.label,
                              dist = sprintf( "Beta( %0.2f, %0.2f )", result$ab.hat[1],   result$ab.hat[2] ) )
      
    master.table <- rbind( master.table, new.row )
  }else if( type==3){
    ####
    # Frequencies
    ####
    result <- cat3( answers, answers.value, w, answers.label, nMCsamples, savePDF=savePDF, saveExcel=saveExcel )
    
    new.row <- new.table.row( result, label=table.label, dist = sprintf( "Gamma( %0.2f, %0.2f )", result$ss.hat[1], result$ss.hat[2] ) )
    master.table <- rbind( master.table, new.row )
    
    ss.hat <- result$ss.hat
    
    consumption.frequency <- rbind( consumption.frequency,
                            data.frame( x=table.label,
                                      ymin   = qgamma( 0.01, shape=ss.hat[1], scale=ss.hat[2] ),
                                      ymax   = qgamma( 0.99, shape=ss.hat[1], scale=ss.hat[2] ),
                                      lower  = qgamma( 0.025, shape=ss.hat[1], scale=ss.hat[2] ),
                                      middle = qgamma( 0.5, shape=ss.hat[1], scale=ss.hat[2] ),
                                      upper  = qgamma( 0.975, shape=ss.hat[1], scale=ss.hat[2] )
                          ) 
    )
    
  } else if( type==4){
    ####
    # Time span
    ####
    
    answers.value2 <- with( codebook$answers, codebook$answers[CODE==codes[j-14], c("ANSWER", "LOW","HIGH", "VALUE")] )
    answers.label2 <- droplevels( na.omit( with( codebook$answers, codebook$answers[CODE==codes[j-14], c("ANSWER","DESC")] ) ) )
    ind.na <- is.na( data[j]) | is.na( data[j-14] )

    answers <- data.frame( answers1 = as.numeric( data[!ind.na, j] ),
                           answers2 = as.numeric( data[!ind.na, j-14] ))
    w <- weights[!ind.na]
    
    ind.exist <- with(answers, (answers1 %in% answers.value$ANSWER) & (answers2 %in% answers.value2$ANSWER))
    
    answers <- answers[ind.exist,]
    w <- w[ind.exist]
    answers$w <- w
    
    answers <- merge( answers, answers.label, by.x="answers1", by.y="ANSWER" )
    answers <- merge( answers, answers.label2, by.x="answers2", by.y="ANSWER" )

    data.counts <- answers %>% group_by( DESC.x, DESC.y ) %>% summarize( Freq=sum(w) ) %>% 
      group_by( DESC.y ) %>% mutate( Freq = Freq/sum(Freq) )
    data.counts <- xtabs( Freq ~ DESC.x +DESC.y, data.counts )
    names(dimnames(data.counts)) <- c("Storage time", "Location")            
    
    write.table(  data.counts, file=paste("./Outcomes/", table.label,".txt", sep="" ) )
    
    #Actually, Jurgen is only interested in the fridge
    w <- answers[ answers$DESC.y=="fridge", "w" ]
    answers <- answers[ answers$DESC.y=="fridge", "answers1" ]
    
    result <- cat4( answers, answers.value, w, answers.label, nMCsamples, savePDF=savePDF, saveExcel=saveExcel )
    
    r.hat <- mean(result$r.hat)
    
    new.row <- new.table.row( result,  label=table.label, dist=sprintf( "Exp( 1/%0.2f )", 1/mean( result$r.hat )))
    
    master.table <- rbind( master.table, new.row )
    
    ##
    # Update storage time table
    ##
    storage.time <- rbind( storage.time,
                          data.frame( x=table.label, 
                                      ymin   = qexp( 0.01, rate=r.hat),
                                      ymax   = qexp( 0.99, rate=r.hat),
                                      lower  = qexp( 0.025, rate=r.hat),
                                      middle = qexp( 0.5, rate=r.hat),
                                      upper  = qexp( 0.975, rate=r.hat)
                          ) 
    )
  }
  
  ####
  # Update storage-location table
  ####
  if( str_detect( codes[j], "B4."))
  {
    loc.freq <- data.frame( ANSWER=answers ) %>% group_by(ANSWER) %>% summarise( freq=n() ) %>% 
                  mutate( freq=freq/sum(freq) )  %>% left_join( answers.label, by="ANSWER" )
    
    storage.location <- rbind( storage.location, data.frame( x=figure.label , freq=loc.freq$freq, desc=loc.freq$DESC) )
  }
}

###
# Plot storage location frequencies
###
storage.location <- storage.location %>% mutate( x=as.factor( x ), Location=desc )
p <- ggplot( storage.location, aes( x=x, y=freq) )
p <- p + geom_bar(  aes( fill=Location), stat="identity" )
p <- p + coord_flip()
p <- p + scale_x_discrete("")
p <- p + scale_y_continuous("Frequency", labels = percent)
p <- p + scale_fill_grey(start = 0, end = .9)
p + theme_bw()

ggsave( "storage_location.eps", path="Figuren", colormodel="RGB",
        width=30, height=10, units="cm")
        

###
# Plot storage times
###
storage.time$x <- factor( storage.time$x, 
                         levels=storage.time$x[order( storage.time$middle )])

p <- ggplot( storage.time, aes( x=x, ymin=ymin, ymax=ymax, lower=lower, middle=middle, upper=upper))
p <- p + geom_boxplot( stat="identity" )
p <- p + labs( title="Comparison of storage times",
               x="Product", y="Storage Time [days]")
p <- p + theme(axis.text.x=element_text(angle = -90, hjust = 0))
p + theme_bw()

ggsave( "storage.eps", path="Figuren", colormodel="RGB",
        width=10, height=10, units="cm")



###
# Make the raw consumption table
###
combinations <- select( codebook$rawlabels, c1,c2 ) %>% unique()
for( i in 1:nrow( combinations ) )
{
  to.combine <- as.character( combinations[i,] )
  y <- get.props.counts.labels( to.combine, codebook, data )
  raw.consumption.table <- optim.plot.add.gamma( y, to.combine, raw.consumption.table )
}



####
# Calculate percentage of 'doers'
####
codebook <- read.codebook( dont.do=TRUE )
do.prev <- numeric( n.questions )
N <- integer( n.questions )

for( j in 1:n.questions){
  index   <- which( codebook$questions$CODE==codes[j] ) # Index into codebook
  type    <- codebook$questions$answertype[ index ] # Type 1,2,3 or 4
   
  # Get the meaning and labels of the answer options
  answers.value <- filter( codebook$answers, CODE==codes[j] ) %>% select( ANSWER,LOW, HIGH, VALUE )
  answers.label <- filter( codebook$answers, CODE==codes[j] ) %>% select( ANSWER, DESC) %>% na.omit() %>% droplevels()
  
  dont.index <- answers.label[ answers.label$DESC=="don't do", "ANSWER"]
  
  if( type==4) { #Type 4 to be handled differently
    code <- j-14
    answers.value2 <- filter( codebook$answers, CODE==codes[code] ) %>% select( ANSWER,LOW, HIGH, VALUE )
    answers.label2 <- filter( codebook$answers, CODE==codes[code] ) %>% select( ANSWER, DESC) %>% na.omit() %>% droplevels()
    
    dont.index2 <- answers.label2[ answers.label2$DESC=="don't do", "ANSWER"]
    
    ind.na <- is.na( data[,j]) | is.na( data[,code] )
    
    answers <- data.frame( answers1 = as.numeric( data[!ind.na, j] ),
                           answers2 = as.numeric( data[!ind.na, code] ) )
    
    ind.exist <- with(answers, (answers1 %in% answers.value$ANSWER) & (answers2 %in% answers.value2$ANSWER))
    
    answers <- answers[ind.exist,]
    
    N[j] <- nrow( answers )
    do.prev[j] <- 1-sum( answers$answers1==dont.index |answers$answers2==dont.index2 )/N[j]
  }else if(type==0){ 
    do.prev[j] <- NA
    N[j]       <- NA
  }else{
    answers <- as.numeric( na.omit( data[j] )[,1])  # The answers given, one line per respondent
    answers <- answers[ answers %in% answers.value$ANSWER ] # Answers that don't exist anymore, we delete
    do.prev[j] <- 1-sum( answers==dont.index )/length( answers)
    N[j] <- length( answers )
  }
}

master.table <- cbind( master.table, N, prevalence=do.prev )

# Change order of columns
master.table <- master.table[, c("code","N", "prevalence", "question","type", "dist", "mean", "median","p9","p99") ]

print( xtable(master.table),  include.rownames=F, file="./Outcomes/table.html", type='html')


###
# Add do-ers to raw.consumption table
###
raw.consumption.table <- raw.consumption.table %>% mutate( code1=as.character(code1)) %>%
                          left_join( data.frame( prevalence=do.prev, code1=codes, stringsAsFactors = F)) %>%
                          mutate( total.cons = perc.category * mean * prevalence )

print( xtable(raw.consumption.table), file="./Outcomes/table.raw.html", type='html')


###
# Add do-ers to consumption.frequency data frame
###
# consumption.frequency$prevalence <- master.table[ master.table$type==3, "prevalence"]
# consumption.frequency <- droplevels( consumption.frequency[-c(17,18),] )
# consumption.frequency$total <- with( consumption.frequency, prevalence*middle )
# 
# consumption.frequency$x <- factor( consumption.frequency$x,
#                          levels=consumption.frequency$x[ order( consumption.frequency$total,decreasing=T )])
# 
# consumption.frequency1 <- consumption.frequency
# consumption.frequency2 <- consumption.frequency
# consumption.frequency3 <- consumption.frequency
# 
# consumption.frequency1$panel="Frequency"
# consumption.frequency2$panel="Prevalcence"
# consumption.frequency3$panel="Population Frequency"
# 
# consumption.frequency <- rbind( consumption.frequency1, consumption.frequency2, consumption.frequency3 )
# consumption.frequency$panel <- as.factor( consumption.frequency$panel )
# levels( consumption.frequency$panel ) <- levels( consumption.frequency$panel )[c(3,1,2)]
# 
# p <- ggplot( consumption.frequency, aes( x=x, y=prevalence, ymin=ymin, ymax=ymax, lower=lower, middle=middle, upper=upper) )
# p <- p + facet_grid( panel~., scales="free")
# p <- p + layer( data=consumption.frequency2, geom="bar", mapping=aes(y=prevalence), stat="identity", position="identity" )
# p <- p + layer( data=consumption.frequency1, geom="boxplot", stat="identity", position="identity")
# p <- p + layer( data=consumption.frequency3, geom="bar",mapping=aes(y=total), stat="identity", position="identity" )
# p <- p + labs( title="Comparison of consumption frequencies",  x="", y="Frequencies [1/year], Prevalence [-]" )
# p <- p + theme_bw()
# p <- p + theme(axis.text.x=element_text(angle = -90, hjust = 0))
# p
# 
# ggsave( "frequency.eps", path="Figuren", colormodel="RGB",
#         width=10, height=20, units="cm")
