## -----------------------------------------------------------------------------
library(agro)
ffbase <- get_data_from_uri("https://data.cipotato.org/dataset.xhtml?persistentId=doi:10.21223/P3/QWFBBZ", ".")
head(ffbase)
ffend <- get_data_from_uri("https://data.cipotato.org/dataset.xhtml?persistentId=doi:10.21223/P3/ZQJNRS", ".")
head(ffend)


## ----read in the data, warning = F--------------------------------------------
ff1 <- grep('\\.dta$', ffbase, value=TRUE)
#the data is stored as .dta from stata, so we need to use the package `readstata123`
library(readstata13)
x <- lapply(ff1, read.dta13, generate.factors=T, nonint.factors=T)
names(x) <- basename(ff1)

ff2 <- grep('\\.dta$', ffend, value=TRUE)
x2 <- lapply(ff2, read.dta13, generate.factors=T, nonint.factors=T)
names(x2) <- basename(ff2)


## ----belief variables---------------------------------------------------------
belief <- x2$`mbend_pg19_genderbased_attitude and perception_part1.dta`
#we only want observations below 627, since these are the ones that also appeared in the baseline. 
belief <- belief[belief$hhid < 627,c(4, 9, 14:20, 34)]
colnames(belief) <- c("hhid", "health", "taste", "yield", "storability", "sweetness", "maturity", "child", "disease", "color")

#We need to switch the variables from the Linkert scale in which they were interviewed to 0-1 variables. 
belief$health <- ifelse(belief$health == "Strongly agree" | belief$health == "Agree", 1, 0)
belief$taste <- ifelse(belief$taste == "Disagree" | belief$taste == "Strongly disagree", 1, 0)
belief$yield <- ifelse(belief$yield == "Strongly agree" | belief$yield == "Agree", 1, 0)
belief$storability <- ifelse(belief$storability == "Disagree" | belief$storability == "Strongly disagree", 1, 0)
belief$sweetness <- ifelse(belief$sweetness == "Disagree" | belief$sweetness == "Strongly disagree", 1, 0)
belief$child <- ifelse(belief$child == "Disagree" | belief$child == "Strongly disagree", 1, 0)
belief$disease <- ifelse(belief$disease == "Disagree" | belief$disease == "Strongly disagree",1, 0)
belief$maturity <- ifelse(belief$maturity == "Agree" | belief$maturity == "Strongly agree", 1, 0)
belief$color <- ifelse(belief$color == "Disagree" | belief$color == "Strongly disagree", 1, 0)

#mean and standard deviation of each variable
apply(belief[,-1], 2, mean)
apply(belief, 2, sd)


## ----outcome variables--------------------------------------------------------
sp <- x2$mbend_pg5_sweetpotato_production_table.dta
sp <- sp[sp$hhid < 627,c(4,12)]
colnames(sp) <- c("hhid", "OFSP")
sp$OFSP <- ifelse(sp$OFSP == "Yes", 1, 0)
#The data was recorded on a plot level, and we want household level--so we aggregate.
sp2 <- aggregate(sp$OFSP, list(sp$hhid), sum)
colnames(sp2) <- c("hhid", "OFSP")
sp2$OFSP[sp2$OFSP > 0] <- 1
summary(sp2$OFSP)

change <- x2$mbend_pg6_sp_production_cont.dta
change <- change[change$hhid < 627, c(4,7,10)]
#one of the responses was 50, which is an error, so it is converted to 10.
change$j27c[change$j27c>10] <- 10

change$change <- (change$j27c-change$j27f)/10
summary(change$change)
change <- change[,c(1,4)]

#add the outcome variables to the belief variables dataframe
allvar <- merge(sp2, change, by = "hhid")
allvar <- merge(allvar, belief, by = "hhid")

#Now we can 
means <- round(apply(allvar[,-1], 2, mean, na.rm=T), 2)
knitr::kable(means)


## -----------------------------------------------------------------------------
library(nFactors)
belief <- belief[,-1]
ev <- eigen(cor(belief))
ap <- parallel(subject = nrow(belief), var=ncol(belief), rep=100, cent = .05)
nS <- nScree(x=ev$values, aparallel = ap$eigen$qevpea)
plotnScree(nS)




## ----control variables--------------------------------------------------------
dem <- x$mbbase_pg2_adult_demog.dta
demhhh <- dem[dem$D3=="Head",c(1, 3,5,8)]
colnames(demhhh) <- c("hhid", "sex", "age", "edu")
#hhhsex d2
demhhh$sex <- ifelse(demhhh$sex == "Male", 1, 0)
#hhhage 2009-d4
demhhh$age <- 2009-demhhh$age
#hhhedu above primary level d7
levels(demhhh$edu)<- c(0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)
demhhh$edu <- as.numeric(as.character(demhhh$edu))

#infants number below five
dem2 <- x$mbbase_pg2_adult_demog.dta
dem2 <- dem2[,c(1,5)]
colnames(dem2) <- c("hhid", "age")
dem2$age <- 2009 - dem2$age
dem2$infants <- ifelse(dem2$age < 6, 1, 0)
#demhhh$infants <- aggregate(dem2$infants, list(dem2$hhid), sum)

#with the child data
child <- x$mbbase_pg3_child_demog.dta
child <- child[,c(1,6)]
colnames(child) <- c("hhid", "child")
child$child <- ifelse(child$child == "no infant", 0, 1)
child2 <- aggregate(child$child, list(child$hhid), sum)
colnames(child2) <- c("hhid", "child")
demhhh$infants <- child2$child

#noninfants number above five
dem2$noninfants <- ifelse(dem2$age > 5, 1, 0)
dem3 <- aggregate(dem2$noninfants, list(dem2$hhid), sum)
demhhh$noninfants <- dem3$x

#farmsize
farmsize <-x$`mbbase_pg5_crop production.dta`
farmsize <- farmsize[,c(1,4:6)]
farmsize$size <- farmsize$P02A + farmsize$P02B
demhhh$farmsize <- farmsize$size

#soldSP
#TODO: Find info from baseline, this is endline
sold <- x2$mbend_pg6_sp_production_cont.dta
sold <- sold[,c(4,11)]
colnames(sold) <- c("hhid", "sold")
sold$sold <- ifelse(sold$sold=="None at all", 0, 1)
#demhhh$sold <- sold$sold

#labour 
labor <- x$mbbase_pg8_labour_and_credit.dta
labor <- labor[, c(1,5)]
colnames(labor) <- c("hhid", "labor")
labor$labor <- ifelse(labor$labor == "Only hired labor", 1, 0)
demhhh$labor <- labor$labor

#credit
credit <- x$mbbase_pg8_labour_and_credit.dta
credit <- credit[,c(1, 36, 37)]
colnames(credit) <- c("hhid", "applied", "received")
credit$credit <- ifelse(credit$applied == "Yes" & credit$received == "Yes", 1, 0)
demhhh$credit <- credit$credit

#region
mwanza <- x$mbbase_pg1_cover_page.dta$A01
mwanza <- ifelse(mwanza == "Mwanza", 1, 0)
demhhh$mwanza <- mwanza
mara <- x$mbbase_pg1_cover_page.dta$A01
mara <- ifelse(mara == "Mara", 1, 0)
demhhh$mara <- mara

#we only want the observations that were included in the allvar dataframe, which were the 434 observations from both surveys--so we use the argument all.x=T.
allvar <- merge(allvar,demhhh, all.x=T)

apply(allvar, 2, mean)




## ----propensity scores--------------------------------------------------------
#empty dataframe in whcih to put the coefficients
output <- data.frame(matrix(NA, nrow=431, ncol=9))
#list of variables we use as independent variables in these regressions
k <- colnames(belief)

#Here we create a loop which puts the coefficients of each regression into the output dataframe. 
for (i in seq_along(k)) {
  j <- k[i]
    fit <- glm(allvar[,j] ~ sex + age + edu + infants + noninfants + farmsize + labor + credit, family= binomial(link = "probit"), data = allvar)
  output[[i]] <- predict(fit, type = "response")
}

colnames(output) <- c("health.ate", "taste.ate", "yield.ate", "storability.ate", "sweetness.ate", "maturity.ate", "child.ate", "disease.ate", "color.ate")



## ----weights------------------------------------------------------------------
allvar$rownames <- rownames(allvar)
output$rownames <- rownames(output)
belief2 <- merge(allvar, output, by.x="rownames")

#for treatment: 1/output$health
#for control: 1/(1-output$health)
allvar$health.ate2 <- ifelse(belief2$health ==1, 1/belief2$health.ate, 1/(1-belief2$health.ate))
allvar$taste.ate2 <- ifelse(belief2$taste ==1, 1/belief2$taste.ate, 1/(1-belief2$taste.ate))
allvar$yield.ate2 <- ifelse(belief2$yield ==1, 1/belief2$yield.ate, 1/(1-belief2$yield.ate))
allvar$storability.ate2 <- ifelse(belief2$storability ==1, 1/belief2$storability.ate, 1/(1-belief2$storability.ate))
allvar$sweetness.ate2 <- ifelse(belief2$sweetness ==1, 1/belief2$sweetness.ate, 1/(1-belief2$sweetness.ate))
allvar$maturity.ate2 <- ifelse(belief2$maturity ==1, 1/belief2$maturity.ate, 1/(1-belief2$maturity.ate))
allvar$child.ate2 <- ifelse(belief2$child ==1, 1/belief2$child.ate, 1/(1-belief2$child.ate))
allvar$disease.ate2 <- ifelse(belief2$disease ==1, 1/belief2$disease.ate, 1/(1-belief2$disease.ate))
allvar$color.ate2 <- ifelse(belief2$color == 1, 1/belief2$color.ate, 1/(1-belief2$color.ate) )



## ----unweighted---------------------------------------------------------------
fit <- lm(OFSP ~ health + yield + storability + sweetness + maturity + child + disease + color,  data=allvar)
summary(fit)

fit2 <-lm(change ~ health + yield + storability + sweetness + maturity + child + disease + color,  data=allvar)
summary(fit2)



## ----scores-------------------------------------------------------------------
fitps <- lm(OFSP ~ health.ate2 + yield.ate2 + storability.ate2 + sweetness.ate2 + maturity.ate2 + child.ate2 + disease.ate2 + color.ate2, data=allvar)
summary(fitps)

fitps2 <- lm(change ~ health.ate2 + yield.ate2 + storability.ate2 + sweetness.ate2 + maturity.ate2 + child.ate2 + disease.ate2 + color.ate2, data=allvar)
summary(fitps2)



## -----------------------------------------------------------------------------
healthps <- glm(health ~ sex + age + edu + infants + noninfants + farmsize + labor + credit, family= binomial(), data=allvar)


tasteps <- glm(taste ~ sex + age + edu + infants + noninfants + farmsize + labor + credit, family= binomial(), data=allvar)

yieldps <- glm(yield ~ sex + age + edu + infants + noninfants + farmsize + labor + credit, family= binomial(), data=allvar)

storabilityps <- glm(storability ~ sex + age + edu + infants + noninfants + farmsize + labor + credit, family= binomial(), data=allvar)

sweetnessps <- glm(sweetness ~ sex + age + edu + infants + noninfants + farmsize + labor + credit, family= binomial(), data=allvar)

maturityps <- glm(maturity ~ sex + age + edu + infants + noninfants + farmsize + labor + credit, family= binomial(), data=allvar)

childps <- glm(child ~ sex + age + edu + infants + noninfants + farmsize + labor + credit, family= binomial(), data=allvar)

diseaseps <- glm(disease ~ sex + age + edu + infants + noninfants + farmsize + labor + credit, family= binomial(), data=allvar)

colorps <- glm(color ~ sex + age + edu + infants + noninfants + farmsize + labor + credit, family= binomial(), data=allvar)

allvar$health.ate <-  predict(healthps, type="response")
allvar$taste.ate <- predict(tasteps, type = "response")
allvar$yield.ate <- predict(yieldps, type = "response")
allvar$storability.ate <- predict(storabilityps, type = "response")
allvar$sweetness.ate <- predict(sweetnessps, type = "response")
allvar$maturity.ate <- predict(maturityps, type = "response")
allvar$childps.ate <- predict(childps, type = "response")
allvar$disease.ate <- predict(diseaseps, type = "response")
allvar$color.ate <- predict(colorps, type = "response")


#for control: 1/(1-output$health)
allvar$health.ate2 <- ifelse(allvar$health ==1, 1/allvar$health.ate, 1/(1-allvar$health.ate))
allvar$taste.ate2 <- ifelse(allvar$taste ==1, 1/allvar$taste.ate, 1/(1-allvar$taste.ate))
allvar$yield.ate2 <- ifelse(allvar$yield ==1, 1/allvar$yield.ate, 1/(1-allvar$yield.ate))
allvar$storability.ate2 <- ifelse(allvar$storability ==1, 1/allvar$storability.ate, 1/(1-allvar$storability.ate))
allvar$sweetness.ate2 <- ifelse(allvar$sweetness ==1, 1/allvar$sweetness.ate, 1/(1-allvar$sweetness.ate))
allvar$maturity.ate2 <- ifelse(allvar$maturity ==1, 1/allvar$maturity.ate, 1/(1-allvar$maturity.ate))
allvar$child.ate2 <- ifelse(allvar$child ==1, 1/allvar$child.ate, 1/(1-allvar$child.ate))
allvar$disease.ate2 <- ifelse(allvar$disease ==1, 1/allvar$disease.ate, 1/(1-allvar$disease.ate))
allvar$color.ate2 <- ifelse(allvar$color == 1, 1/allvar$color.ate, 1/(1-allvar$color.ate))


fitps <- lm(OFSP ~ health.ate2 + yield.ate2 + storability.ate2 + sweetness.ate2 + maturity.ate2 + child.ate2 + disease.ate2 + color.ate2, data=allvar)
summary(fitps)

fitps2 <- lm(change ~ health.ate2 + yield.ate2 + storability.ate2 + sweetness.ate2 + maturity.ate2 + child.ate2 + disease.ate2 + color.ate2, data=allvar)
summary(fitps2)