## ----Read in individual file--------------------------------------------------
datapath <- "D:/gdrive/share/agdatascience/data/PSI/Tanzania/2010/data"
Part1 <- read.table(file.path(datapath, 'Part 1 Farmers identification and village characteristics_1.tab'), sep='\t', header=TRUE)

#TODO: explain that the two dots ".." refer to the parent directory (folder)


## ----automatedread------------------------------------------------------------
#First set up the datapath that all files are stored in.
datapath <-"../data/PSI/Tanzania/2010/data"
#If you need to unzip the files, use the code below: 
#zf <- file.path(datapath, 'Sustainable Intensification Tanzania 2010 Data.zip')
#unzip(zf, exdir=datapath)
#This function lists all the .tab files in the datapath listed above.
ff <- list.files(datapath, pattern='\\.tab$', full=TRUE)
#Here we use lapply to read in multiple files at once. 
x <- lapply(ff, read.delim)


## ----nametables---------------------------------------------------------------
z <- strsplit(basename(ff), ' ')
z <- t(sapply(z, function(x) x[1:4]))
z[z[,3]!='Section', 4] <- ""
z <- apply(z[,-3], 1, function(i) paste(i, collapse=""))
names(x) <- z


## ----improvedVarieties--------------------------------------------------------
#To start a dataframe for just the dependent variables.  
dependent <- x$Part6A[, c('hhldid', 'cropvar1')]

#We can use the summary function to see that the cropvar variable ranges from -13 to 92. 
summary(dependent$cropvar1)

#If the varieties are above 15 or below 1, then they are not maize and given a value of 0. Otherwise, the varieties are maize and are given a value of 1.
dependent$improvevar <- ifelse(dependent$cropvar1 > 0 & dependent$cropvar1 < 16, 1, 0) 

dependent$cropvar1 <- NULL


## ----Minimumtillage-----------------------------------------------------------
notill <- x$Part6A[, c("nomitildrng", "nomitilbfre")]

table(notill$nomitildrng, useNA="always")
# only 0 and 1 are valid values, all others (one case of -77) are set to NA
notill$nomitildrng[! notill$nomitildrng %in% c(0,1) ] <- NA
table(notill$nomitilbfre, useNA = "always")


#select either minimum till before or minimum till during. 
notill$notill <- pmin(1, notill$nomitilbfre + notill$nomitildrng)
#add to our dependent dataframe
dependent$notill <- notill$notill
#The pmin function, or parallel min, ensures that the sum cannot be greater than 1. Our new column is equal to one if either (or both) of the two is equal to 1, and 0 otherwise. 
#TODO: explain the above elsewhere. ALso show as.integer(a | b) for 0 and 1 coded values. 


## ----manureApplication--------------------------------------------------------
#sum manure bought and own manure used
x$Part6B$manure <- x$Part6B$ownmanr + x$Part6B$bghtmanr
#anything below or equal to 0 is NA, and quantities of manure greater than 0 are given value of 1. 
x$Part6B$manure <- ifelse(x$Part6B$manure <= 0, 0, 1)
#Add manure to the dependent variable dataframe
dependent$manure <- x$Part6B$manure 


## ----waterConservation--------------------------------------------------------
watercons <- x$Part6A$soilwtrcnsrv1
#Lets take a look at the raw data for this variable: 
table(watercons, useNA="always")
#Values below 0 are considered NA. Values above 0 are 1
watercons[watercons<0]<-NA
watercons <- ifelse(watercons==1 | watercons==3 | watercons==7, 1, 0)
#Add to the dependent variables dataframe.
dependent$watercons <- watercons


## ----cropDiversity------------------------------------------------------------
#First take the column on intercrop for the original table
intercrop <- x$Part6A$intercrp
#Lets take a look at the variable: 
table(intercrop, useNA = "always")
#Any value that is not 0 or 1 is NA
intercrop[!intercrop %in% c(0,1) ] <- NA
#Now we can see that variables have been moved to the NA category
table(intercrop, useNA = "always")

rotation <- x$Part6B[,c("cropgrwn1", "prevcrpgrwn1")]

#A third column is created to determine if a different crop was grown the previous year. If this is true, then the plot is rotating.
 rotation$rotation <- rotation$cropgrwn1 != rotation$prevcrpgrwn1 

#convert to numeric 
rotation$rotation <- as.numeric(rotation$rotation)

#Next we add the intercrop variable to the rotation dataframe
rotation$intercrop <- intercrop
#add the two variables together. If still 0, then no crop diversity. If 1 or 2, then crop diversity
rotation$diversity <- pmin(1, rotation$rotation+rotation$intercrop)
#add to dependent variable dataframe
dependent$diversity <- rotation$diversity


## ----InorganicFertilizer------------------------------------------------------
#adds up two different types of fertilizer
fertilizer <- x$Part6B$amtplant + x$Part6B$amttopdr 
table(fertilizer, useNA="always")
#Values less than 0 are NA
fertilizer[fertilizer<0] <- NA
fertilizer[fertilizer>0]<-1
#add to dependent variable dataframe
dependent$fertilizer<-fertilizer



## ----allVars------------------------------------------------------------------
dependent$ID<-paste(x$Part6A$hhldid, x$Part6A$serialno,  x$Part6A$season,  x$Part6A$plotcode,  x$Part6A$subplot, sep="")
 #change the order to match the paper
dependentVar <- dependent[c("hhldid", "ID", "diversity", "notill", "improvevar", "fertilizer", "manure", "watercons")]
#Mean of all columns except the first, which is household ID. NA variables are not included. 
colMeans(dependentVar[,-c(1:2)], na.rm=T) 


## ----UniqueID-----------------------------------------------------------------
#subset of the variables from the dataframe that are part of the plot characteristic variables analyzed in the paper
allplot <- x$Part6A[,c("hhldid", "serialno", "season", "plotcode", "subplot", "subpltsize", "plotdist", "subpltown")] 
#create the unique plot ID by combining household ID and plot serial number
allplot$ID <- paste(allplot$hhldid, allplot$serialno, allplot$season, allplot$plotcode, allplot$subplot, sep = "") 
#convert from character variable to numeric
allplot$ID <- as.numeric(allplot$ID) 
#remove the two columns that are no longer needed, hhldid and serialno
allplot <- allplot[,-(1:2)] 


n_occur <- data.frame(table(allplot$ID))
n_occur[n_occur$Freq > 1,]


## ----PlotOwnershipSizeDistance------------------------------------------------

table(allplot$subpltown)
#Values not in the range asked by the survey are NA
allplot$subpltown[! allplot$subpltown %in% c(1:5) ] <- NA
#Remaining values between 2 and 5 are 0
allplot$subpltown[allplot$subpltown !=1] <- 0 

#plotsize: here we convert size from acres to hectares to match
allplot$subpltsize<-allplot$subpltsize*.404686 

#plotdist
summary(allplot$plotdist)
#plot distances that are NA are coded as -99 or -77. To get average we need to convert these values to NA.
allplot$plotdist[allplot$plotdist<0] <- NA 


## ----Fertility----------------------------------------------------------------
#First, let's look at the data
table(x$Part6A$soilfert)
#medium fertility is coded as 2
allplot$medfertile <- ifelse(x$Part6A$soilfert != 2, 0, 1)
#low fertility is coded as 3
allplot$lowfertile <- ifelse(x$Part6A$soilfert != 3, 0, 1)


## ----SoilSlope----------------------------------------------------------------
slope <- x$Part6A$soilslpe
#First, let's look at the data
table(slope)
#We have five values of -77, so we will make these NA
slope[slope < 0] <- NA
#Values of 3 are steep slope
allplot$steepslope <- ifelse(slope != 3, 0, 1)
#Values of 2 are medium slope
allplot$medslope <- ifelse(slope != 2, 0, 1)


## ----SoilDepth----------------------------------------------------------------
depth <- x$Part6A$soildpth
#Let's look at the data
table(depth)
#convert negative values to NA
depth[depth<0] <- NA
#Values of 3 are deep depth
allplot$deepdepth <- ifelse(depth != 3, 0, 1)
#Values of 2 are medium depth
allplot$moddepth <- ifelse(depth != 2, 0, 1)


## ----PestDisease--------------------------------------------------------------
risk <- x$Part6C[,c("hhldid", "serialno", "season", "plotcode", "subplot",  "stress")] 
risk[risk<0] <- NA
#creating the unique plot ID
risk$ID <- paste(risk$hhldid, risk$serialno, risk$season, risk$plotcode, risk$subplot, sep = "") 
#converting the ID to numeric
risk$ID <- as.numeric(risk$ID) 
#values of 1 and 2 are considered pest and disease risks, so all other values are converted to 0
table(risk$stress)

risk$stress[risk$stress <0] <-NA
risk$stress <- ifelse(risk$stress<1 |risk$stress > 2, 0, 1)
#gets rid of the now unnecessary columns of hhldid and serialno.
risk <- risk[,6:7] 

#Despite including all relevant inforomation, our id is still not unqiue. This likely explains why we have 44 additional observations in this table--they are not unique. 
#TODO: I'm not sure what to do about this...
n_occur <- data.frame(table(risk$ID))
n_occur[n_occur$Freq > 1,]


## ----CombinecharacteristicsMean-----------------------------------------------
#First create dataframe for all plot-level independent variables
plotchar <- merge(allplot, risk, by = "ID", all.x=T) 
#All plot-level variables combined in single dataframe
PlotVar <- merge(dependentVar, plotchar, by="ID", all.x=T) 
#finds the means of all columns of plot variables, not taking into consideration the first row or NA values. 
colMeans(PlotVar[,-c(1:2)], na.rm=T) 


## ----Household Demographic Characterisitcs------------------------------------
#We want to only get a subset of the data which is the information on household heads
HH <- x$Part2[x$Part2$relnhhead == 1,]
#to see how many people per family, count the number of people with the same household ID, then make this into a data frame
count <- as.data.frame(table(x$Part2$hhldid))
#and add this as a new column to household heads, called size for size of household
HH$size <- count$Freq 
#there is one age recorded as -77, and we want to convert this to NA
HH$age[HH$age == -77] <- NA 
#Then we take a subset from the HH dataframe, which only includes the variables analyzed in this paper
HH <- HH[,c("hhldid", "sex", "age", "education", "size")]



## ----Salaried Employment------------------------------------------------------
intercrop[!intercrop %in% c(0,1) ] <- NA
occupation<-x$Part2$mainoccup
#The survey codes between 0 and 10 for occupation, so anything outside that range is NA
occupation[!occupation %in% c(0:10)] <- NA
# alternatively: occupation[occupation < 0 | occupation > 10] <- NA
table(occupation, useNA= "always")
#If not equal to 2, the dummy variable is value 0. If equal to 2, dummy variable value is 1
occupation <- ifelse(occupation != 2, 0, 1)
#If no member of the household has salaried employment, the variable remains 0. Any value above 0 means that at least one member of the household has salaried employment.
occupation <- as.data.frame(tapply(occupation, list(x$Part2$hhldid), FUN=sum))
 #As in intro, if more than 1 member of the household has salaried employment, it is still considered value of 1 for dummy variable
colnames(occupation) <- "occupation"
occupation[occupation > 1] <- 1
#Add this variable to household dataframe
HH$salary <- occupation$occupation


## ----Livestock----------------------------------------------------------------
livestock <- x$Part7A[,c("hhldid", "totno")]
#If less than 0, considered NA and thus 0. Some values above 1000 appear to be either mistakes or outliers, and are converted to 0
livestock[livestock < 0 |livestock > 1000] <- NA 
#Because each household has multiple observations due to the different types of animals, this function allows us to sum total number of any type of livestock per household
livestock <- tapply(livestock$totno, list(livestock$hhldid), FUN=sum)
livestock <- as.data.frame(livestock)
#add to household dataframe
HH$livestock <- livestock$livestock


## ----Household Assets---------------------------------------------------------
value<-x$Part4A$totvalue
#if less than 0, mark as NA
value[value < 0] <- NA 
#to organize total assets by household ID
asset <- tapply(value, list(x$Part4A$hhldid), FUN=sum) 
asset <- as.data.frame(asset)
#to convert to USD from tanzania shilling
asset <- asset*.0007 
#add to household dataframe
HH$asset <- asset$asset 



## ----Farm Size----------------------------------------------------------------
plotsize <- x$Part6A$subpltsize
#If below 0, considered NA 
plotsize[plotsize < 0] <- NA 
#Because some households have multiple plots, we want to know the average total farm size per household. This function adds up the plot sizes of each households. 
plotsize2 <- as.data.frame(tapply(plotsize, list(x$Part6A$hhldid), FUN=sum))
colnames(plotsize2) <- "size"
#to convert to hectares
plotsize2 <- plotsize2*0.404686 
#add to household dataframe
HH$plotsize <- plotsize2


## ----AccesstoInformation------------------------------------------------------
#walking miniutes to market. If less than 0, NA
walkingmin<-x$Part1$wlkminmn
walkingmin[walkingmin < 0] <- NA 
HH$marketmin <- walkingmin

#Distance to extension (min). If less than 0, NA
exten <- x$Part1$wlkminag
exten[exten < 0] <- NA 
HH$extenmin <- exten


## ----Confidence---------------------------------------------------------------
govtcon<-x$Part3B$gvtskillconf
#Less than 0 is converted to NA
govtcon[govtcon <= 0] <- NA 
#Convert to 0-1 dummy based on confidence level
govtcon <- ifelse(govtcon < 4, 0, 1)
 #Add to HH dataframe
HH$confidex <- govtcon


## ----credit-------------------------------------------------------------------
credit <- x$Part9A[,c("hhldid", "needcrdt", "gotcrdt")] 
#This makes the credit variable equal to 1 if they both need credit and did not get it
credit$credit <- ifelse(credit$needcrdt=="1" & credit$gotcrdt=="0", "1", "0")  
#IF na, converted to 0
credit$credit[is.na(credit$credit)] <- 0 
credit$credit <- as.numeric(credit$credit)
credit2 <- tapply(credit$credit, list(credit$hhldid), FUN=sum)  
credit2 <- as.data.frame(credit2)
credit2[credit2 > 1] <- 1
HH$credit <- credit2$credit2


## ----SocialCapital------------------------------------------------------------
cap<-x$Part3B$frndrltldrshp
table(cap)
#If less than 0 then given NA. Greater than 1 are mistakes, so also NA.
cap[! cap %in% c(0,1)] <- NA
#added to HH dataframe
HH$leader <- cap


## ----MemberGroup--------------------------------------------------------------
group<-x$Part3A$typgrp 
table(group)
#Because we have so many NAs, here we consider below or equal to 0 to be 0. If greater then 0, coded as 1 since it doesn't matter what type.
group[group<=0]<-0
group[group>1]<-1

#If any member of the household is in the group, the dummy variable is given a value of 1.
groupHH <- tapply(group, list(x$Part3A$hhldid), FUN=sum)  
#if more than one member is in a group, still 1
groupHH[groupHH > 1] <- 1 
 #add to HH dataframe
HH$groups <- as.numeric(as.character(groupHH))


## ----RelativesGrainTraders----------------------------------------------------
relative <- x$Part3B$rltwithvlg + x$Part3B$rltoutvlg
relative[relative < 0] <- NA 
HH$relative <- relative

trader <- x$Part3B$tradersinvlg + x$Part3B$tradersoutvlg
trader[trader < 0] <- NA 
HH$trader <- trader


## ----TrustGovernment----------------------------------------------------------
trustgov <- x$Part3B$gvtskillconf
trustgov[trustgov < 0] <-NA
table(trustgov)
trustgov <- ifelse(trustgov <= 3,0, 1)
HH$trustgov <- trustgov


## ----SumHouseholdVariables----------------------------------------------------
#mean of all household variables
summ <- HH[,-1] %>%
  colMeans(na.rm=T) %>%
  round(2) %>%
  as.data.frame()
#same as:summ <- as.data.frame(round(colMeans(HH[,-1], na.rm=T), 2))

colnames(summ) <- "Mean"
kable(summ)


## -----------------------------------------------------------------------------
#This function will only combine hhldid observations that exist in both PlotVar and HH
AllVar <- merge(PlotVar, HH, by="hhldid")
AllVar$ID <- as.numeric(AllVar$ID)
AllVar <- na.omit(AllVar)


## ----correlation--------------------------------------------------------------
#to compute correlation coefficient between all variables 
cor <- cor(AllVar[,-c(1:2)], use="complete.obs")
library(knitr)
#only plot a few of the correlations, since there are so many variables
kable(cor[1:8,1:8])
#to compute the correlation coefficient between just two specific variables
cor(AllVar$manure, AllVar$livestock, use="complete.obs")



## ----plots--------------------------------------------------------------------
library(ggplot2)
#We also use the tidyverse package for pipes
library(tidyverse)
library(dplyr)

graph <- data.frame(AllVar$livestock, as.factor(AllVar$manure))
colnames(graph) <- c("livestock", "manure")
#to compare the two groups, we make two levels from this factor variable
levels(graph$manure) <-  c("no.manure", "manure")


graph %>% 
  filter(is.na(manure)==FALSE)%>%
  ggplot(aes(y=livestock, x=manure)) +
  geom_boxplot()




## ----number of SIPs adopted---------------------------------------------------
adopt <-AllVar[,1:8]
#first lets combine plot level information on adoption into household level
adopt <- aggregate(adopt[,3:8], by=list(adopt$hhldid), FUN=sum)
adopt[,2:7][adopt[,2:7]>1] <- 1
#Then we add a column that tells us how many SIPs the household adopted
adopt <- mutate(adopt, sum = diversity + notill + improvevar + fertilizer + manure + watercons)

#plot figure 2, which tells us how many of the six SIPs were adopted by percentage of households
ggplot(adopt, aes(adopt$sum, y=..prop..))+
  geom_bar(color="blue", fill="darkblue") +
  ggtitle("SIPs adoption intensity") +
  xlab("Number of SIPs") +
  ylab("Adoption rate, %")


## ----yield by SIPs------------------------------------------------------------
#to find average plot size for maize per household
size <-x$Part6A[x$Part6A$cropgrwn1 == "1",c(1,22)]
size[size<0] <- NA
#to get total area per household
size <- aggregate(size$areashare1, list(size$hhldid), sum)
colnames(size) <- c("hhldid", "plotsize")

yield <- x$Part6D[x$Part6D$cropgrwn==1,]
#we only care about yield, so we extract production from column 8
yield <- yield[,c(1,8)]
#add the information on plot size to the production 
yield <- left_join(yield, size, by="hhldid")
yield$yield <- yield$prodn/yield$plotsize

#next, add the column for yield to the adopt dataframe (here we use dplyr package but you could also use merge)
adopt <- left_join(adopt, yield, by=c("Group.1" = "hhldid"))

#then, we can plot the number of SIPs adopted against yield
ggplot(adopt, aes(adopt$sum, adopt$yield)) +
  geom_point()


## -----------------------------------------------------------------------------
simple <- lm(manure ~ livestock, data= AllVar)
summary(simple)
  
multlinear <- lm(manure ~ livestock + plotdist, data = AllVar)
summary(multlinear)


## -----------------------------------------------------------------------------
probit <- glm(manure ~ livestock + plotdist, family = binomial(link = "probit"), data = AllVar)
summary(probit)
logit <- glm(manure ~ livestock + plotdist, family = binomial(link = "logit"), data = AllVar)
summary(logit)


## -----------------------------------------------------------------------------
library(MASS)
lda <- lda(manure ~ livestock + plotdist, data = AllVar)
lda


## ---- eval = FALSE------------------------------------------------------------
## #convert to boolean and matrix
## dept <- as.matrix(AllVar[,c(3,4,7)]>0)
## 
## #library(mvProbit)
## mvprobit <- mvProbit(cbind(diversity, notill, manure) ~ livestock + plotdist + subpltown + sex + age + education  + size, data = cbind(dept, AllVar))
## summary(mvprobit)