## ----agro---------------------------------------------------------------------
ff <- agro::get_data_from_uri("hdl:11529/10754", ".")
head(ff)
length(ff)


## ----read2010-----------------------------------------------------------------
#Creates a list of files with the pattern .tab
ff <- grep('\\.tab$', ff, value=TRUE)
#Use lapply to read in each file from the above list
x <- lapply(ff, read.delim)

#The next section cleans up the names of each of the tables that were read in above.
z <- strsplit(basename(ff), ' ')
z <- t(sapply(z, function(x) x[1:4]))
z[z[,3]!='Section', 4] <- ""
z <- apply(z[,-3], 1, function(i) paste(i, collapse=""))
names(x) <- z



## ----districts----------------------------------------------------------------
#create a factor of district
district <- as.factor(x$Part0$district2) 
#Name each level according to the district name, for clarity.
levels(district) <- c("Karatu", "Mbulu", "Mvomero", "Kilosa")
summary(district)


## ----Table2-SexAgeSizeEducation-----------------------------------------------
#To only get a subset of the data which is the information on household heads
 HH <- x$Part2[x$Part2$relnhhead=="1",] 
#Add district information to the household head data
HH$district <- as.factor(x$Part0$district2)
levels(HH$district) <- c("Karatu", "Mbulu", "Mvomero", "Kilosa") 
#Household Size:
count <- as.data.frame(table(x$Part2$hhldid))
#Add this as a new column to household heads
HH$size <- count$Freq 
 
#Household Age: there is one age recorded as -77, we convert this to NA
HH$age[HH$age==-77] <- NA 

#Use aggregate to find the mean of multiple columns (sex, age, education, and size) in the data frame by district
table <- aggregate(HH[,c(3,6, 19, 7)], list(HH$district), mean, na.rm=T) 
table[,2:5] <- round(table[,2:5], 2)
#transpose so that it matches the table in the report
tablet <- t(table) 
tabled <- as.data.frame(tablet[-1,])
# the first row will be the header 
colnames(tabled) <- c("Karatu", "Mbulu", "Mvomero", "Kilosa") 

tabled$Average  <- apply(na.omit(HH[,c(3,6, 19, 7)]), 2, mean)
tabled[,5] <- round(tabled[,5], 2)

rownames(tabled) <-c("Male headed household", "Age of Household Head (years)", "Household Size (number)", "Education of Household Head (years)")
library(knitr)
kable(tabled)


## ----MainOccupation-----------------------------------------------------------
library(dplyr)
occupation <- as.factor(HH$mainoccup)
levels(occupation) = c("Other", "Farming(crop+livestock)", "Salaried Employment", "Self-employed off-farm", "Casual labourer off-farm", "Other", "Other") 
 #this gives us a count

occupationtab <-  as.data.frame(table(occupation, district))
occupationtab <- reshape(occupationtab, timevar="district", idvar="occupation", direction= "wide") 
occupationtab <- occupationtab[,-1]

#To cacluate percentages of each column, we can use the mutate function in the dplyr package.
occupationtab <- mutate(occupationtab, 
                Karatu = Freq.Karatu / sum(Freq.Karatu),
                Mbulu = Freq.Mbulu / sum(Freq.Mbulu),
                Mvomero = Freq.Mvomero/sum(Freq.Mvomero),
                Kilosa = Freq.Kilosa/sum(Freq.Kilosa))

#Remove count values and round
occupationtab <- round(occupationtab[,5:8],2)
#name the rows
rownames(occupationtab) <-levels(occupation)

#To get number of individuals dedicated to each occupation from all households
occupationall <- table(occupation)
occupationall <- round((occupationall/710)*100, 1)
occupationtab$Average <- occupationall
#To change the order to match the chart
occupationtab <- occupationtab[c(2,3,4,5,1), ]
#To display the chart
kable(occupationtab)



## ----AccesstoHealthServices---------------------------------------------------
#Extract all variables into single dataframe
health <- x$Part1[,c("dtshlthc", "wlkminhl", "dstmnsrc", "wlkminsrcdrwtr")]
health[health<0] <- NA
#add district information
health$district <- as.factor(x$Part0$district2 )
levels(health$district) <- c("Karatu", "Mbulu", "Mvomero", "Kilosa")

#Use aggregate to see on average the access to health services by district
healtht <- aggregate(health[,1:4], list(district), mean, na.rm=T)
healtht[,2:5] <- round(healtht[,2:5], 2)

healtht <- t(healtht)
colnames(healtht) <- healtht[1,]
healtht <- healtht[-1,]
healtht <- as.data.frame(healtht)

#Next, we get the health variables for all the observations together
totalh  <-as.data.frame(apply(na.omit(health[1:4]), 2, mean))
healtht$Average <- round(totalh[,1], 2)
rownames(healtht) <- c("Distance to health center", "Time to health center", "Distance to drinking water", "Time to drinking water")
#To view the dataframe
kable(healtht)


## ----agro2--------------------------------------------------------------------
library(agro)
ff <- get_data_from_uri("hdl:11529/10755", ".")
head(ff)
length(ff)


## ----Read2013Data-------------------------------------------------------------
ff2 <- grep('\\.tab$', ff, value=TRUE)
x2 <- lapply(ff2, read.delim)

#To name each of the tables
z2 <- strsplit(basename(ff2), " |-")
z2 <- t(sapply(z2, function(x2) x2[1:4]))
z2[1,3] <- "Part"
z2[1, 4] <- "A"
z2[2,3] <- "Part"
z2[2,4] <- "B"
z2[z2[,3]!='Part', 4] <- ""
z2 <- apply(z2[,-3], 1, function(i) paste(i, collapse=""))
names(x2) <- z2


## ----AttritionRate------------------------------------------------------------
#the fist section creates a matrix that tells us the number of people interviewed by district
district13 <- as.factor(x2$Module1B$district2)
levels(district13) <- c("Karatu", "Mbulu", "Mvomero", "Kilosa")
district13.count <- as.matrix(summary(district13), byrow=T)

#then I do the same for the 2010 data
district10 <- as.factor(x$Part0$district2)
levels(district10) <- c("Karatu", "Mbulu", "Mvomero", "Kilosa")
district10.count <- as.matrix(summary(district10), byrow=T)

#combine the two matrices into one table
attrition <- as.data.frame(cbind(district13.count, district10.count))
#add the column names
colnames(attrition) <- c("N_2010", "N_2013") 
#formula to add column for attrition rate
attrition$percent_attrition <- (((attrition$N_2010-attrition$N_2013)/attrition$N_2010)*100) 
#rounding
attrition$percent_attrition <- round(attrition$percent_attrition, 2) 
knitr::kable(attrition) 



## ----Table2AgeEducationSizeDistrict, eval=F-----------------------------------
## #same as 2010 data, I want to only see data on household heads
## HH2 <- x2$Module2A[x2$Module2A$relnhhead == 1,]
## 
## #RH error here
## HH2$district <- as.factor(x2$Module1B$district2)
## levels(HH2$district) <- c("Karatu", "Mbulu", "Mvomero", "Kilosa")
## 
## count2 <- as.data.frame(table(x2$Module2A$hhldid))
## #again, to get a count of number of people per household
## HH2$size <- count2$Freq
## 
## table2 <- aggregate(HH2[,c(3, 6, 8, 32)], list(HH2$district), mean, na.rm=T)
## table2[,2:5] <- round(table2[,2:5], 2)
## 
## tablet2 <- t(table2)
## tabled2 <- as.data.frame(tablet2)
## colnames(tabled2) <- c("Karatu", "Mbulu", "Mvomero", "Kilosa")  # the first row will be the header
## tabled2 <- tabled2[-1, ]
## tabled2


## ----DemographicsGender, eval=F-----------------------------------------------
## HH2$district <- x2$Module1B$district2
## table3<-aggregate(HH2[,c(6, 8, 32)], list(HH2$sex), mean, na.rm=T)
## table3[,2:4] <- round(table3[,2:4], 2)
## tablet3 <- t(table3)
## tabled3 <- as.data.frame(tablet3[-1,])
## colnames(tabled3) <- c("Female", "Male")
## tabled3


## ----FoodSecurityFig12--------------------------------------------------------
#View(x2$Module9)
#To get one response per household
foodin <- x2$Module9[x2$Module9$qnntype == 1, c("chronic", "transitory", "breakeven", "surplus")]
#Use`apply` to count the number of individuals in each category
slices <- apply(foodin, 2, sum)

#we can use this information to create the pie chart
pct <- round(slices/sum(slices)*100)
#create labels
lbls <- paste(colnames(foodin), pct, "%", sep = " ") 
color <- c("darkblue", "red", "lightgreen", "purple")
pie(slices, labels = lbls, col = color, main = "Overall Household Food Security")


## ----FoodSecurityGender-------------------------------------------------------
#extract information on food security from household heads
foodin2 <- x2$Module9[x2$Module9$qnntype == 1, c(21, 24, 25, 26, 27)]
gender <- as.factor(foodin2$sexhhead)
levels(gender) <- c("Female", "Male")
#food security by gender
tables <- aggregate(foodin2, list(gender), sum, na.rm=T)
foodsec<-as.matrix(tables[,3:6])

#The next code creates a new matrix that is the proportion of total for each cateogry of food security
Diagonal_Matrix <- diag(1/rowSums(foodsec))
#Then, we can multiply the proportion of each catogory by the number in each category to get percent food secure in each category
foodsec.pct <- Diagonal_Matrix %*% foodsec
#Here we add the total food security status, without gender
foodsec.pct <- rbind(foodsec.pct, pct/100)
#To change order of rows to match
foodsec.pct <- foodsec.pct[c(2,1,3),]
rownames(foodsec.pct) <- c("Male", "Female", "Total")

#To get total food security for male, female, and both
overall <- cbind(foodin2[,2]+foodin2[,3], foodin2[,4]+foodin2[,5])
colnames(overall) <- c("insecure", "secure")
overall <- data.frame(overall)
overall$gender <- gender
ag <- aggregate(overall[,1:2], list(gender), sum, na.rm=T)
ag$total <- ag$insecure + ag$secure
#first row is male secure, second is female secure, third is total secure
overall.list <-as.data.frame(c((ag[2,3]/ag[2,4]), (ag[1,3]/ag[1,4]), ((ag[1,3]+ag[2,3])/(ag[1,4]+ag[2,4])) ))
colnames(overall.list) <- "Overall"
foodsec.pct <- cbind(foodsec.pct, overall.list)
foodsec.pct <- as.matrix(foodsec.pct)

#Add the barplot
barplot(foodsec.pct, beside=T, main="Food Security by Gender", col=c("darkblue", "darkred", "darkgreen"))
legend("topleft", rownames(foodsec.pct), fill=c("darkblue", "darkred", "darkgreen"))
#And, to see the values:
kable(foodsec.pct)


## ----agro3--------------------------------------------------------------------
library(agro)
ff <- get_data_from_uri("hdl:11529/11128", ".")
head(ff)
length(ff)


## ----read2015-----------------------------------------------------------------
#For the .tab files: 
ff3 <- grep('\\.tab$', ff, value=TRUE)
x3 <- lapply(ff3, read.delim)

z3 <- strsplit(basename(ff3), ' ')
z3 <- t(sapply(z3, function(x3) x3[1:4]))
z3[1,4] <- "A1"
z3[2,4] <- "A2"
z3[3,4] <- "A"
z3[z3[,3]!='Part', 4] <- ""
z3 <- apply(z3[,-3], 1, function(i) paste(i, collapse=""))
names(x3) <- z3

#For the .dta files:
library(foreign)
ff4 <- grep('\\.dta$', ff, value=TRUE)
x4 <- lapply(ff4, read.dta)
z4 <- strsplit(basename(ff4), ' ')
z4 <- t(sapply(z4, function(x4) x4[1:4]))
#This particular table had multiple parts combined, and is simplier without the commas. 
z4[13,4] <- "BCD"
z4[z4[,3] != 'Part', 4] <- ""
z4 <- apply(z4[,-3], 1, function(i) paste(i, collapse=""))
names(x4) <- z4

#To combine the .tab and .dta files into a single object
x3 <- append(x3, x4)


## ----Attrition2015------------------------------------------------------------
#There are 26 NA entries at the end of this table, so we wish to remove them. 
HH3 <- x3$Module2A2[!is.na(x3$Module2A2$relnhhead ),]
HH3$realid <- HH3$hhldid
comb <- merge(HH3, HH, by="hhldid")
HH3$district <- comb$district
district15 <- as.factor(HH3$district)
district15.count <- as.matrix(summary(district15), byrow=T)
attrition <- as.data.frame(cbind(district13.count, district10.count, district15.count))

#add the column names
colnames(attrition) <- c("N_2010", "N_2013", "N_2015") 
#formula to add column for attrition rate
attrition$attrition_since_2010 <- (((attrition$N_2010-attrition$N_2015)/attrition$N_2010)*100) 
attrition$attrition_since_2013 <- (((attrition$N_2013-attrition$N_2015)/attrition$N_2010)*100)
#rounding
attrition$attrition_since_2010 <- round(attrition$attrition_since_2010, digits=2) 
attrition$attrition_since_2013 <- round(attrition$attrition_since_2013, digits=2) 

kable(attrition) 


## ----FoodSecurityStatus-------------------------------------------------------
foodin15 <- x3$MODULE9[x3$MODULE9$Respondent_type=="Primary (main respondent)",]
slices15 <- summary(na.omit(foodin15$M9_PA_15))
lbls <- c("chronic", "transitory", "breakeven", "surplus")
pct15 <- round(slices15/sum(slices15)*100)
lbls <- paste(lbls, pct15, "%",sep=" ")
color <- c("darkblue", "red", "lightgreen", "purple")
pie(slices15, labels=lbls, col=color, main= "Overall Household Food Security 2015")



## ----FoodSecurityPanel--------------------------------------------------------
foodsec10 <- x$Part1$fmlfoodc
#The 2010 food security data is coded as numbers from 1-4. Numbers outside this range are errors and are NA.
foodsec10[foodsec10>4 | foodsec10<1] <- NA
k <- as.data.frame(table(foodsec10))
slices10 <- k$Freq
pct10 <- round(slices10/sum(slices10)*100)

pctcomb <- data.frame(pct10, pct, pct15)
pctcomb <- t(pctcomb)
colnames(pctcomb) <- c("chronic", "transitory", "breakeven", "surplus")
rownames(pctcomb) <- c("2010", "2013", "2015") 
barplot(pctcomb, beside=T, legend.text=row.names(pctcomb), col=c('darkred', 'darkblue', 'darkgreen'), main="Food Security Status 2010-2015") 


## ----fertilizer---------------------------------------------------------------
#2010 fertilizer
fert10 <- x$Part6B[x$Part6B$cropgrwn1 == 1,c("hhldid", "subplot", "amtplant", "cstpltfert", "amttopdr", "csttopdr")]
fert10[fert10 < 0] <- NA
fert10$quanttotal10 <- fert10$amtplant + fert10$amttopdr
fert10$fert10 <- pmin(1, fert10$amtplant + fert10$amttopdr)
summary(fert10$fert10)

fert10hh <- aggregate(fert10$fert10, by= list(fert10$hhldid), sum)
colnames(fert10hh) <- c("hhldid", "fert10")
fert10hh$fert10[fert10hh$fert10 > 1] <- 1

#2013 fertilizer
fert13 <- x2$Module3A1[x2$Module3A1$cropgrwn1 == 1, c("hhldid", "plotcode", "amtplant", "cstpltfert", "amttopdr", "csttopdr", "methpayfert")]
fert13[fert13 < 0] <- NA
fert13$quanttotal13 <- fert13$amtplant + fert13$amttopdr
fert13$fert13 <- pmin(1, fert13$amtplant + fert13$amttopdr)
summary(fert13$fert13)

fert13hh <- aggregate(fert13$fert13, by= list(fert13$hhldid), sum)
colnames(fert13hh) <- c("hhldid", "fert13")
fert13hh$fert13[fert13hh$fert13 > 1] <- 1

fert15 <- x3$Module3A[x3$Module3A$cropgrwn1 == 1,c("hhldid", "plotcode", "amtplant", "cstpltfert", "amttopdr", "csttopdr", "methpayfert")]
fert15[fert15 < 0] <- NA
fert15$quanttotal15 <- fert15$amtplant + fert15$amttopdr
fert15$fert15 <- pmin(1, fert15$amtplant + fert15$amttopdr)
summary(fert15$fert15)

fert15hh <- aggregate(fert15$fert15, by= list(fert15$hhldid), sum)
colnames(fert15hh) <- c("hhldid", "fert15")
fert15hh$fert15[fert15hh$fert15 > 1] <- 1


## ----allyears-----------------------------------------------------------------
fertall <- merge(fert10hh, fert13hh, all.x = TRUE, all.y = TRUE)
fertall <- merge(fertall, fert15hh, all.x = TRUE, all.y = TRUE)

increase <- fertall$fert15 > fertall$fert10
summary(increase)
decrease <-  fertall$fert15 < fertall$fert10
summary(decrease)

#how many years did the household use fertilizer
fertall$sum <- as.factor(fertall$fert10 + fertall$fert13 + fertall$fert15)
#did the household use fertilizer in any of the three years?
fertall$any <- pmin(1, fertall$fert10 + fertall$fert13 + fertall$fert15)

#We can see that the highest portion of households used fertilizer in 2013
summary(fertall[,c(2:4)])