## ----agro--------------------------------------------------------------------- ff <- agro::get_data_from_uri("hdl:11529/10754", ".") head(ff) length(ff) ## ----read2010----------------------------------------------------------------- #Creates a list of files with the pattern .tab ff <- grep('\\.tab$', ff, value=TRUE) #Use lapply to read in each file from the above list x <- lapply(ff, read.delim) #The next section cleans up the names of each of the tables that were read in above. z <- strsplit(basename(ff), ' ') z <- t(sapply(z, function(x) x[1:4])) z[z[,3]!='Section', 4] <- "" z <- apply(z[,-3], 1, function(i) paste(i, collapse="")) names(x) <- z ## ----districts---------------------------------------------------------------- #create a factor of district district <- as.factor(x$Part0$district2) #Name each level according to the district name, for clarity. levels(district) <- c("Karatu", "Mbulu", "Mvomero", "Kilosa") summary(district) ## ----Table2-SexAgeSizeEducation----------------------------------------------- #To only get a subset of the data which is the information on household heads HH <- x$Part2[x$Part2$relnhhead=="1",] #Add district information to the household head data HH$district <- as.factor(x$Part0$district2) levels(HH$district) <- c("Karatu", "Mbulu", "Mvomero", "Kilosa") #Household Size: count <- as.data.frame(table(x$Part2$hhldid)) #Add this as a new column to household heads HH$size <- count$Freq #Household Age: there is one age recorded as -77, we convert this to NA HH$age[HH$age==-77] <- NA #Use aggregate to find the mean of multiple columns (sex, age, education, and size) in the data frame by district table <- aggregate(HH[,c(3,6, 19, 7)], list(HH$district), mean, na.rm=T) table[,2:5] <- round(table[,2:5], 2) #transpose so that it matches the table in the report tablet <- t(table) tabled <- as.data.frame(tablet[-1,]) # the first row will be the header colnames(tabled) <- c("Karatu", "Mbulu", "Mvomero", "Kilosa") tabled$Average <- apply(na.omit(HH[,c(3,6, 19, 7)]), 2, mean) tabled[,5] <- round(tabled[,5], 2) rownames(tabled) <-c("Male headed household", "Age of Household Head (years)", "Household Size (number)", "Education of Household Head (years)") library(knitr) kable(tabled) ## ----MainOccupation----------------------------------------------------------- library(dplyr) occupation <- as.factor(HH$mainoccup) levels(occupation) = c("Other", "Farming(crop+livestock)", "Salaried Employment", "Self-employed off-farm", "Casual labourer off-farm", "Other", "Other") #this gives us a count occupationtab <- as.data.frame(table(occupation, district)) occupationtab <- reshape(occupationtab, timevar="district", idvar="occupation", direction= "wide") occupationtab <- occupationtab[,-1] #To cacluate percentages of each column, we can use the mutate function in the dplyr package. occupationtab <- mutate(occupationtab, Karatu = Freq.Karatu / sum(Freq.Karatu), Mbulu = Freq.Mbulu / sum(Freq.Mbulu), Mvomero = Freq.Mvomero/sum(Freq.Mvomero), Kilosa = Freq.Kilosa/sum(Freq.Kilosa)) #Remove count values and round occupationtab <- round(occupationtab[,5:8],2) #name the rows rownames(occupationtab) <-levels(occupation) #To get number of individuals dedicated to each occupation from all households occupationall <- table(occupation) occupationall <- round((occupationall/710)*100, 1) occupationtab$Average <- occupationall #To change the order to match the chart occupationtab <- occupationtab[c(2,3,4,5,1), ] #To display the chart kable(occupationtab) ## ----AccesstoHealthServices--------------------------------------------------- #Extract all variables into single dataframe health <- x$Part1[,c("dtshlthc", "wlkminhl", "dstmnsrc", "wlkminsrcdrwtr")] health[health<0] <- NA #add district information health$district <- as.factor(x$Part0$district2 ) levels(health$district) <- c("Karatu", "Mbulu", "Mvomero", "Kilosa") #Use aggregate to see on average the access to health services by district healtht <- aggregate(health[,1:4], list(district), mean, na.rm=T) healtht[,2:5] <- round(healtht[,2:5], 2) healtht <- t(healtht) colnames(healtht) <- healtht[1,] healtht <- healtht[-1,] healtht <- as.data.frame(healtht) #Next, we get the health variables for all the observations together totalh <-as.data.frame(apply(na.omit(health[1:4]), 2, mean)) healtht$Average <- round(totalh[,1], 2) rownames(healtht) <- c("Distance to health center", "Time to health center", "Distance to drinking water", "Time to drinking water") #To view the dataframe kable(healtht) ## ----agro2-------------------------------------------------------------------- library(agro) ff <- get_data_from_uri("hdl:11529/10755", ".") head(ff) length(ff) ## ----Read2013Data------------------------------------------------------------- ff2 <- grep('\\.tab$', ff, value=TRUE) x2 <- lapply(ff2, read.delim) #To name each of the tables z2 <- strsplit(basename(ff2), " |-") z2 <- t(sapply(z2, function(x2) x2[1:4])) z2[1,3] <- "Part" z2[1, 4] <- "A" z2[2,3] <- "Part" z2[2,4] <- "B" z2[z2[,3]!='Part', 4] <- "" z2 <- apply(z2[,-3], 1, function(i) paste(i, collapse="")) names(x2) <- z2 ## ----AttritionRate------------------------------------------------------------ #the fist section creates a matrix that tells us the number of people interviewed by district district13 <- as.factor(x2$Module1B$district2) levels(district13) <- c("Karatu", "Mbulu", "Mvomero", "Kilosa") district13.count <- as.matrix(summary(district13), byrow=T) #then I do the same for the 2010 data district10 <- as.factor(x$Part0$district2) levels(district10) <- c("Karatu", "Mbulu", "Mvomero", "Kilosa") district10.count <- as.matrix(summary(district10), byrow=T) #combine the two matrices into one table attrition <- as.data.frame(cbind(district13.count, district10.count)) #add the column names colnames(attrition) <- c("N_2010", "N_2013") #formula to add column for attrition rate attrition$percent_attrition <- (((attrition$N_2010-attrition$N_2013)/attrition$N_2010)*100) #rounding attrition$percent_attrition <- round(attrition$percent_attrition, 2) knitr::kable(attrition) ## ----Table2AgeEducationSizeDistrict, eval=F----------------------------------- ## #same as 2010 data, I want to only see data on household heads ## HH2 <- x2$Module2A[x2$Module2A$relnhhead == 1,] ## ## #RH error here ## HH2$district <- as.factor(x2$Module1B$district2) ## levels(HH2$district) <- c("Karatu", "Mbulu", "Mvomero", "Kilosa") ## ## count2 <- as.data.frame(table(x2$Module2A$hhldid)) ## #again, to get a count of number of people per household ## HH2$size <- count2$Freq ## ## table2 <- aggregate(HH2[,c(3, 6, 8, 32)], list(HH2$district), mean, na.rm=T) ## table2[,2:5] <- round(table2[,2:5], 2) ## ## tablet2 <- t(table2) ## tabled2 <- as.data.frame(tablet2) ## colnames(tabled2) <- c("Karatu", "Mbulu", "Mvomero", "Kilosa") # the first row will be the header ## tabled2 <- tabled2[-1, ] ## tabled2 ## ----DemographicsGender, eval=F----------------------------------------------- ## HH2$district <- x2$Module1B$district2 ## table3<-aggregate(HH2[,c(6, 8, 32)], list(HH2$sex), mean, na.rm=T) ## table3[,2:4] <- round(table3[,2:4], 2) ## tablet3 <- t(table3) ## tabled3 <- as.data.frame(tablet3[-1,]) ## colnames(tabled3) <- c("Female", "Male") ## tabled3 ## ----FoodSecurityFig12-------------------------------------------------------- #View(x2$Module9) #To get one response per household foodin <- x2$Module9[x2$Module9$qnntype == 1, c("chronic", "transitory", "breakeven", "surplus")] #Use`apply` to count the number of individuals in each category slices <- apply(foodin, 2, sum) #we can use this information to create the pie chart pct <- round(slices/sum(slices)*100) #create labels lbls <- paste(colnames(foodin), pct, "%", sep = " ") color <- c("darkblue", "red", "lightgreen", "purple") pie(slices, labels = lbls, col = color, main = "Overall Household Food Security") ## ----FoodSecurityGender------------------------------------------------------- #extract information on food security from household heads foodin2 <- x2$Module9[x2$Module9$qnntype == 1, c(21, 24, 25, 26, 27)] gender <- as.factor(foodin2$sexhhead) levels(gender) <- c("Female", "Male") #food security by gender tables <- aggregate(foodin2, list(gender), sum, na.rm=T) foodsec<-as.matrix(tables[,3:6]) #The next code creates a new matrix that is the proportion of total for each cateogry of food security Diagonal_Matrix <- diag(1/rowSums(foodsec)) #Then, we can multiply the proportion of each catogory by the number in each category to get percent food secure in each category foodsec.pct <- Diagonal_Matrix %*% foodsec #Here we add the total food security status, without gender foodsec.pct <- rbind(foodsec.pct, pct/100) #To change order of rows to match foodsec.pct <- foodsec.pct[c(2,1,3),] rownames(foodsec.pct) <- c("Male", "Female", "Total") #To get total food security for male, female, and both overall <- cbind(foodin2[,2]+foodin2[,3], foodin2[,4]+foodin2[,5]) colnames(overall) <- c("insecure", "secure") overall <- data.frame(overall) overall$gender <- gender ag <- aggregate(overall[,1:2], list(gender), sum, na.rm=T) ag$total <- ag$insecure + ag$secure #first row is male secure, second is female secure, third is total secure overall.list <-as.data.frame(c((ag[2,3]/ag[2,4]), (ag[1,3]/ag[1,4]), ((ag[1,3]+ag[2,3])/(ag[1,4]+ag[2,4])) )) colnames(overall.list) <- "Overall" foodsec.pct <- cbind(foodsec.pct, overall.list) foodsec.pct <- as.matrix(foodsec.pct) #Add the barplot barplot(foodsec.pct, beside=T, main="Food Security by Gender", col=c("darkblue", "darkred", "darkgreen")) legend("topleft", rownames(foodsec.pct), fill=c("darkblue", "darkred", "darkgreen")) #And, to see the values: kable(foodsec.pct) ## ----agro3-------------------------------------------------------------------- library(agro) ff <- get_data_from_uri("hdl:11529/11128", ".") head(ff) length(ff) ## ----read2015----------------------------------------------------------------- #For the .tab files: ff3 <- grep('\\.tab$', ff, value=TRUE) x3 <- lapply(ff3, read.delim) z3 <- strsplit(basename(ff3), ' ') z3 <- t(sapply(z3, function(x3) x3[1:4])) z3[1,4] <- "A1" z3[2,4] <- "A2" z3[3,4] <- "A" z3[z3[,3]!='Part', 4] <- "" z3 <- apply(z3[,-3], 1, function(i) paste(i, collapse="")) names(x3) <- z3 #For the .dta files: library(foreign) ff4 <- grep('\\.dta$', ff, value=TRUE) x4 <- lapply(ff4, read.dta) z4 <- strsplit(basename(ff4), ' ') z4 <- t(sapply(z4, function(x4) x4[1:4])) #This particular table had multiple parts combined, and is simplier without the commas. z4[13,4] <- "BCD" z4[z4[,3] != 'Part', 4] <- "" z4 <- apply(z4[,-3], 1, function(i) paste(i, collapse="")) names(x4) <- z4 #To combine the .tab and .dta files into a single object x3 <- append(x3, x4) ## ----Attrition2015------------------------------------------------------------ #There are 26 NA entries at the end of this table, so we wish to remove them. HH3 <- x3$Module2A2[!is.na(x3$Module2A2$relnhhead ),] HH3$realid <- HH3$hhldid comb <- merge(HH3, HH, by="hhldid") HH3$district <- comb$district district15 <- as.factor(HH3$district) district15.count <- as.matrix(summary(district15), byrow=T) attrition <- as.data.frame(cbind(district13.count, district10.count, district15.count)) #add the column names colnames(attrition) <- c("N_2010", "N_2013", "N_2015") #formula to add column for attrition rate attrition$attrition_since_2010 <- (((attrition$N_2010-attrition$N_2015)/attrition$N_2010)*100) attrition$attrition_since_2013 <- (((attrition$N_2013-attrition$N_2015)/attrition$N_2010)*100) #rounding attrition$attrition_since_2010 <- round(attrition$attrition_since_2010, digits=2) attrition$attrition_since_2013 <- round(attrition$attrition_since_2013, digits=2) kable(attrition) ## ----FoodSecurityStatus------------------------------------------------------- foodin15 <- x3$MODULE9[x3$MODULE9$Respondent_type=="Primary (main respondent)",] slices15 <- summary(na.omit(foodin15$M9_PA_15)) lbls <- c("chronic", "transitory", "breakeven", "surplus") pct15 <- round(slices15/sum(slices15)*100) lbls <- paste(lbls, pct15, "%",sep=" ") color <- c("darkblue", "red", "lightgreen", "purple") pie(slices15, labels=lbls, col=color, main= "Overall Household Food Security 2015") ## ----FoodSecurityPanel-------------------------------------------------------- foodsec10 <- x$Part1$fmlfoodc #The 2010 food security data is coded as numbers from 1-4. Numbers outside this range are errors and are NA. foodsec10[foodsec10>4 | foodsec10<1] <- NA k <- as.data.frame(table(foodsec10)) slices10 <- k$Freq pct10 <- round(slices10/sum(slices10)*100) pctcomb <- data.frame(pct10, pct, pct15) pctcomb <- t(pctcomb) colnames(pctcomb) <- c("chronic", "transitory", "breakeven", "surplus") rownames(pctcomb) <- c("2010", "2013", "2015") barplot(pctcomb, beside=T, legend.text=row.names(pctcomb), col=c('darkred', 'darkblue', 'darkgreen'), main="Food Security Status 2010-2015") ## ----fertilizer--------------------------------------------------------------- #2010 fertilizer fert10 <- x$Part6B[x$Part6B$cropgrwn1 == 1,c("hhldid", "subplot", "amtplant", "cstpltfert", "amttopdr", "csttopdr")] fert10[fert10 < 0] <- NA fert10$quanttotal10 <- fert10$amtplant + fert10$amttopdr fert10$fert10 <- pmin(1, fert10$amtplant + fert10$amttopdr) summary(fert10$fert10) fert10hh <- aggregate(fert10$fert10, by= list(fert10$hhldid), sum) colnames(fert10hh) <- c("hhldid", "fert10") fert10hh$fert10[fert10hh$fert10 > 1] <- 1 #2013 fertilizer fert13 <- x2$Module3A1[x2$Module3A1$cropgrwn1 == 1, c("hhldid", "plotcode", "amtplant", "cstpltfert", "amttopdr", "csttopdr", "methpayfert")] fert13[fert13 < 0] <- NA fert13$quanttotal13 <- fert13$amtplant + fert13$amttopdr fert13$fert13 <- pmin(1, fert13$amtplant + fert13$amttopdr) summary(fert13$fert13) fert13hh <- aggregate(fert13$fert13, by= list(fert13$hhldid), sum) colnames(fert13hh) <- c("hhldid", "fert13") fert13hh$fert13[fert13hh$fert13 > 1] <- 1 fert15 <- x3$Module3A[x3$Module3A$cropgrwn1 == 1,c("hhldid", "plotcode", "amtplant", "cstpltfert", "amttopdr", "csttopdr", "methpayfert")] fert15[fert15 < 0] <- NA fert15$quanttotal15 <- fert15$amtplant + fert15$amttopdr fert15$fert15 <- pmin(1, fert15$amtplant + fert15$amttopdr) summary(fert15$fert15) fert15hh <- aggregate(fert15$fert15, by= list(fert15$hhldid), sum) colnames(fert15hh) <- c("hhldid", "fert15") fert15hh$fert15[fert15hh$fert15 > 1] <- 1 ## ----allyears----------------------------------------------------------------- fertall <- merge(fert10hh, fert13hh, all.x = TRUE, all.y = TRUE) fertall <- merge(fertall, fert15hh, all.x = TRUE, all.y = TRUE) increase <- fertall$fert15 > fertall$fert10 summary(increase) decrease <- fertall$fert15 < fertall$fert10 summary(decrease) #how many years did the household use fertilizer fertall$sum <- as.factor(fertall$fert10 + fertall$fert13 + fertall$fert15) #did the household use fertilizer in any of the three years? fertall$any <- pmin(1, fertall$fert10 + fertall$fert13 + fertall$fert15) #We can see that the highest portion of households used fertilizer in 2013 summary(fertall[,c(2:4)])