## ----Read in individual file-------------------------------------------------- datapath <- "D:/gdrive/share/agdatascience/data/PSI/Tanzania/2010/data" Part1 <- read.table(file.path(datapath, 'Part 1 Farmers identification and village characteristics_1.tab'), sep='\t', header=TRUE) #TODO: explain that the two dots ".." refer to the parent directory (folder) ## ----automatedread------------------------------------------------------------ #First set up the datapath that all files are stored in. datapath <-"../data/PSI/Tanzania/2010/data" #If you need to unzip the files, use the code below: #zf <- file.path(datapath, 'Sustainable Intensification Tanzania 2010 Data.zip') #unzip(zf, exdir=datapath) #This function lists all the .tab files in the datapath listed above. ff <- list.files(datapath, pattern='\\.tab$', full=TRUE) #Here we use lapply to read in multiple files at once. x <- lapply(ff, read.delim) ## ----nametables--------------------------------------------------------------- z <- strsplit(basename(ff), ' ') z <- t(sapply(z, function(x) x[1:4])) z[z[,3]!='Section', 4] <- "" z <- apply(z[,-3], 1, function(i) paste(i, collapse="")) names(x) <- z ## ----improvedVarieties-------------------------------------------------------- #To start a dataframe for just the dependent variables. dependent <- x$Part6A[, c('hhldid', 'cropvar1')] #We can use the summary function to see that the cropvar variable ranges from -13 to 92. summary(dependent$cropvar1) #If the varieties are above 15 or below 1, then they are not maize and given a value of 0. Otherwise, the varieties are maize and are given a value of 1. dependent$improvevar <- ifelse(dependent$cropvar1 > 0 & dependent$cropvar1 < 16, 1, 0) dependent$cropvar1 <- NULL ## ----Minimumtillage----------------------------------------------------------- notill <- x$Part6A[, c("nomitildrng", "nomitilbfre")] table(notill$nomitildrng, useNA="always") # only 0 and 1 are valid values, all others (one case of -77) are set to NA notill$nomitildrng[! notill$nomitildrng %in% c(0,1) ] <- NA table(notill$nomitilbfre, useNA = "always") #select either minimum till before or minimum till during. notill$notill <- pmin(1, notill$nomitilbfre + notill$nomitildrng) #add to our dependent dataframe dependent$notill <- notill$notill #The pmin function, or parallel min, ensures that the sum cannot be greater than 1. Our new column is equal to one if either (or both) of the two is equal to 1, and 0 otherwise. #TODO: explain the above elsewhere. ALso show as.integer(a | b) for 0 and 1 coded values. ## ----manureApplication-------------------------------------------------------- #sum manure bought and own manure used x$Part6B$manure <- x$Part6B$ownmanr + x$Part6B$bghtmanr #anything below or equal to 0 is NA, and quantities of manure greater than 0 are given value of 1. x$Part6B$manure <- ifelse(x$Part6B$manure <= 0, 0, 1) #Add manure to the dependent variable dataframe dependent$manure <- x$Part6B$manure ## ----waterConservation-------------------------------------------------------- watercons <- x$Part6A$soilwtrcnsrv1 #Lets take a look at the raw data for this variable: table(watercons, useNA="always") #Values below 0 are considered NA. Values above 0 are 1 watercons[watercons<0]<-NA watercons <- ifelse(watercons==1 | watercons==3 | watercons==7, 1, 0) #Add to the dependent variables dataframe. dependent$watercons <- watercons ## ----cropDiversity------------------------------------------------------------ #First take the column on intercrop for the original table intercrop <- x$Part6A$intercrp #Lets take a look at the variable: table(intercrop, useNA = "always") #Any value that is not 0 or 1 is NA intercrop[!intercrop %in% c(0,1) ] <- NA #Now we can see that variables have been moved to the NA category table(intercrop, useNA = "always") rotation <- x$Part6B[,c("cropgrwn1", "prevcrpgrwn1")] #A third column is created to determine if a different crop was grown the previous year. If this is true, then the plot is rotating. rotation$rotation <- rotation$cropgrwn1 != rotation$prevcrpgrwn1 #convert to numeric rotation$rotation <- as.numeric(rotation$rotation) #Next we add the intercrop variable to the rotation dataframe rotation$intercrop <- intercrop #add the two variables together. If still 0, then no crop diversity. If 1 or 2, then crop diversity rotation$diversity <- pmin(1, rotation$rotation+rotation$intercrop) #add to dependent variable dataframe dependent$diversity <- rotation$diversity ## ----InorganicFertilizer------------------------------------------------------ #adds up two different types of fertilizer fertilizer <- x$Part6B$amtplant + x$Part6B$amttopdr table(fertilizer, useNA="always") #Values less than 0 are NA fertilizer[fertilizer<0] <- NA fertilizer[fertilizer>0]<-1 #add to dependent variable dataframe dependent$fertilizer<-fertilizer ## ----allVars------------------------------------------------------------------ dependent$ID<-paste(x$Part6A$hhldid, x$Part6A$serialno, x$Part6A$season, x$Part6A$plotcode, x$Part6A$subplot, sep="") #change the order to match the paper dependentVar <- dependent[c("hhldid", "ID", "diversity", "notill", "improvevar", "fertilizer", "manure", "watercons")] #Mean of all columns except the first, which is household ID. NA variables are not included. colMeans(dependentVar[,-c(1:2)], na.rm=T) ## ----UniqueID----------------------------------------------------------------- #subset of the variables from the dataframe that are part of the plot characteristic variables analyzed in the paper allplot <- x$Part6A[,c("hhldid", "serialno", "season", "plotcode", "subplot", "subpltsize", "plotdist", "subpltown")] #create the unique plot ID by combining household ID and plot serial number allplot$ID <- paste(allplot$hhldid, allplot$serialno, allplot$season, allplot$plotcode, allplot$subplot, sep = "") #convert from character variable to numeric allplot$ID <- as.numeric(allplot$ID) #remove the two columns that are no longer needed, hhldid and serialno allplot <- allplot[,-(1:2)] n_occur <- data.frame(table(allplot$ID)) n_occur[n_occur$Freq > 1,] ## ----PlotOwnershipSizeDistance------------------------------------------------ table(allplot$subpltown) #Values not in the range asked by the survey are NA allplot$subpltown[! allplot$subpltown %in% c(1:5) ] <- NA #Remaining values between 2 and 5 are 0 allplot$subpltown[allplot$subpltown !=1] <- 0 #plotsize: here we convert size from acres to hectares to match allplot$subpltsize<-allplot$subpltsize*.404686 #plotdist summary(allplot$plotdist) #plot distances that are NA are coded as -99 or -77. To get average we need to convert these values to NA. allplot$plotdist[allplot$plotdist<0] <- NA ## ----Fertility---------------------------------------------------------------- #First, let's look at the data table(x$Part6A$soilfert) #medium fertility is coded as 2 allplot$medfertile <- ifelse(x$Part6A$soilfert != 2, 0, 1) #low fertility is coded as 3 allplot$lowfertile <- ifelse(x$Part6A$soilfert != 3, 0, 1) ## ----SoilSlope---------------------------------------------------------------- slope <- x$Part6A$soilslpe #First, let's look at the data table(slope) #We have five values of -77, so we will make these NA slope[slope < 0] <- NA #Values of 3 are steep slope allplot$steepslope <- ifelse(slope != 3, 0, 1) #Values of 2 are medium slope allplot$medslope <- ifelse(slope != 2, 0, 1) ## ----SoilDepth---------------------------------------------------------------- depth <- x$Part6A$soildpth #Let's look at the data table(depth) #convert negative values to NA depth[depth<0] <- NA #Values of 3 are deep depth allplot$deepdepth <- ifelse(depth != 3, 0, 1) #Values of 2 are medium depth allplot$moddepth <- ifelse(depth != 2, 0, 1) ## ----PestDisease-------------------------------------------------------------- risk <- x$Part6C[,c("hhldid", "serialno", "season", "plotcode", "subplot", "stress")] risk[risk<0] <- NA #creating the unique plot ID risk$ID <- paste(risk$hhldid, risk$serialno, risk$season, risk$plotcode, risk$subplot, sep = "") #converting the ID to numeric risk$ID <- as.numeric(risk$ID) #values of 1 and 2 are considered pest and disease risks, so all other values are converted to 0 table(risk$stress) risk$stress[risk$stress <0] <-NA risk$stress <- ifelse(risk$stress<1 |risk$stress > 2, 0, 1) #gets rid of the now unnecessary columns of hhldid and serialno. risk <- risk[,6:7] #Despite including all relevant inforomation, our id is still not unqiue. This likely explains why we have 44 additional observations in this table--they are not unique. #TODO: I'm not sure what to do about this... n_occur <- data.frame(table(risk$ID)) n_occur[n_occur$Freq > 1,] ## ----CombinecharacteristicsMean----------------------------------------------- #First create dataframe for all plot-level independent variables plotchar <- merge(allplot, risk, by = "ID", all.x=T) #All plot-level variables combined in single dataframe PlotVar <- merge(dependentVar, plotchar, by="ID", all.x=T) #finds the means of all columns of plot variables, not taking into consideration the first row or NA values. colMeans(PlotVar[,-c(1:2)], na.rm=T) ## ----Household Demographic Characterisitcs------------------------------------ #We want to only get a subset of the data which is the information on household heads HH <- x$Part2[x$Part2$relnhhead == 1,] #to see how many people per family, count the number of people with the same household ID, then make this into a data frame count <- as.data.frame(table(x$Part2$hhldid)) #and add this as a new column to household heads, called size for size of household HH$size <- count$Freq #there is one age recorded as -77, and we want to convert this to NA HH$age[HH$age == -77] <- NA #Then we take a subset from the HH dataframe, which only includes the variables analyzed in this paper HH <- HH[,c("hhldid", "sex", "age", "education", "size")] ## ----Salaried Employment------------------------------------------------------ intercrop[!intercrop %in% c(0,1) ] <- NA occupation<-x$Part2$mainoccup #The survey codes between 0 and 10 for occupation, so anything outside that range is NA occupation[!occupation %in% c(0:10)] <- NA # alternatively: occupation[occupation < 0 | occupation > 10] <- NA table(occupation, useNA= "always") #If not equal to 2, the dummy variable is value 0. If equal to 2, dummy variable value is 1 occupation <- ifelse(occupation != 2, 0, 1) #If no member of the household has salaried employment, the variable remains 0. Any value above 0 means that at least one member of the household has salaried employment. occupation <- as.data.frame(tapply(occupation, list(x$Part2$hhldid), FUN=sum)) #As in intro, if more than 1 member of the household has salaried employment, it is still considered value of 1 for dummy variable colnames(occupation) <- "occupation" occupation[occupation > 1] <- 1 #Add this variable to household dataframe HH$salary <- occupation$occupation ## ----Livestock---------------------------------------------------------------- livestock <- x$Part7A[,c("hhldid", "totno")] #If less than 0, considered NA and thus 0. Some values above 1000 appear to be either mistakes or outliers, and are converted to 0 livestock[livestock < 0 |livestock > 1000] <- NA #Because each household has multiple observations due to the different types of animals, this function allows us to sum total number of any type of livestock per household livestock <- tapply(livestock$totno, list(livestock$hhldid), FUN=sum) livestock <- as.data.frame(livestock) #add to household dataframe HH$livestock <- livestock$livestock ## ----Household Assets--------------------------------------------------------- value<-x$Part4A$totvalue #if less than 0, mark as NA value[value < 0] <- NA #to organize total assets by household ID asset <- tapply(value, list(x$Part4A$hhldid), FUN=sum) asset <- as.data.frame(asset) #to convert to USD from tanzania shilling asset <- asset*.0007 #add to household dataframe HH$asset <- asset$asset ## ----Farm Size---------------------------------------------------------------- plotsize <- x$Part6A$subpltsize #If below 0, considered NA plotsize[plotsize < 0] <- NA #Because some households have multiple plots, we want to know the average total farm size per household. This function adds up the plot sizes of each households. plotsize2 <- as.data.frame(tapply(plotsize, list(x$Part6A$hhldid), FUN=sum)) colnames(plotsize2) <- "size" #to convert to hectares plotsize2 <- plotsize2*0.404686 #add to household dataframe HH$plotsize <- plotsize2 ## ----AccesstoInformation------------------------------------------------------ #walking miniutes to market. If less than 0, NA walkingmin<-x$Part1$wlkminmn walkingmin[walkingmin < 0] <- NA HH$marketmin <- walkingmin #Distance to extension (min). If less than 0, NA exten <- x$Part1$wlkminag exten[exten < 0] <- NA HH$extenmin <- exten ## ----Confidence--------------------------------------------------------------- govtcon<-x$Part3B$gvtskillconf #Less than 0 is converted to NA govtcon[govtcon <= 0] <- NA #Convert to 0-1 dummy based on confidence level govtcon <- ifelse(govtcon < 4, 0, 1) #Add to HH dataframe HH$confidex <- govtcon ## ----credit------------------------------------------------------------------- credit <- x$Part9A[,c("hhldid", "needcrdt", "gotcrdt")] #This makes the credit variable equal to 1 if they both need credit and did not get it credit$credit <- ifelse(credit$needcrdt=="1" & credit$gotcrdt=="0", "1", "0") #IF na, converted to 0 credit$credit[is.na(credit$credit)] <- 0 credit$credit <- as.numeric(credit$credit) credit2 <- tapply(credit$credit, list(credit$hhldid), FUN=sum) credit2 <- as.data.frame(credit2) credit2[credit2 > 1] <- 1 HH$credit <- credit2$credit2 ## ----SocialCapital------------------------------------------------------------ cap<-x$Part3B$frndrltldrshp table(cap) #If less than 0 then given NA. Greater than 1 are mistakes, so also NA. cap[! cap %in% c(0,1)] <- NA #added to HH dataframe HH$leader <- cap ## ----MemberGroup-------------------------------------------------------------- group<-x$Part3A$typgrp table(group) #Because we have so many NAs, here we consider below or equal to 0 to be 0. If greater then 0, coded as 1 since it doesn't matter what type. group[group<=0]<-0 group[group>1]<-1 #If any member of the household is in the group, the dummy variable is given a value of 1. groupHH <- tapply(group, list(x$Part3A$hhldid), FUN=sum) #if more than one member is in a group, still 1 groupHH[groupHH > 1] <- 1 #add to HH dataframe HH$groups <- as.numeric(as.character(groupHH)) ## ----RelativesGrainTraders---------------------------------------------------- relative <- x$Part3B$rltwithvlg + x$Part3B$rltoutvlg relative[relative < 0] <- NA HH$relative <- relative trader <- x$Part3B$tradersinvlg + x$Part3B$tradersoutvlg trader[trader < 0] <- NA HH$trader <- trader ## ----TrustGovernment---------------------------------------------------------- trustgov <- x$Part3B$gvtskillconf trustgov[trustgov < 0] <-NA table(trustgov) trustgov <- ifelse(trustgov <= 3,0, 1) HH$trustgov <- trustgov ## ----SumHouseholdVariables---------------------------------------------------- #mean of all household variables summ <- HH[,-1] %>% colMeans(na.rm=T) %>% round(2) %>% as.data.frame() #same as:summ <- as.data.frame(round(colMeans(HH[,-1], na.rm=T), 2)) colnames(summ) <- "Mean" kable(summ) ## ----------------------------------------------------------------------------- #This function will only combine hhldid observations that exist in both PlotVar and HH AllVar <- merge(PlotVar, HH, by="hhldid") AllVar$ID <- as.numeric(AllVar$ID) AllVar <- na.omit(AllVar) ## ----correlation-------------------------------------------------------------- #to compute correlation coefficient between all variables cor <- cor(AllVar[,-c(1:2)], use="complete.obs") library(knitr) #only plot a few of the correlations, since there are so many variables kable(cor[1:8,1:8]) #to compute the correlation coefficient between just two specific variables cor(AllVar$manure, AllVar$livestock, use="complete.obs") ## ----plots-------------------------------------------------------------------- library(ggplot2) #We also use the tidyverse package for pipes library(tidyverse) library(dplyr) graph <- data.frame(AllVar$livestock, as.factor(AllVar$manure)) colnames(graph) <- c("livestock", "manure") #to compare the two groups, we make two levels from this factor variable levels(graph$manure) <- c("no.manure", "manure") graph %>% filter(is.na(manure)==FALSE)%>% ggplot(aes(y=livestock, x=manure)) + geom_boxplot() ## ----number of SIPs adopted--------------------------------------------------- adopt <-AllVar[,1:8] #first lets combine plot level information on adoption into household level adopt <- aggregate(adopt[,3:8], by=list(adopt$hhldid), FUN=sum) adopt[,2:7][adopt[,2:7]>1] <- 1 #Then we add a column that tells us how many SIPs the household adopted adopt <- mutate(adopt, sum = diversity + notill + improvevar + fertilizer + manure + watercons) #plot figure 2, which tells us how many of the six SIPs were adopted by percentage of households ggplot(adopt, aes(adopt$sum, y=..prop..))+ geom_bar(color="blue", fill="darkblue") + ggtitle("SIPs adoption intensity") + xlab("Number of SIPs") + ylab("Adoption rate, %") ## ----yield by SIPs------------------------------------------------------------ #to find average plot size for maize per household size <-x$Part6A[x$Part6A$cropgrwn1 == "1",c(1,22)] size[size<0] <- NA #to get total area per household size <- aggregate(size$areashare1, list(size$hhldid), sum) colnames(size) <- c("hhldid", "plotsize") yield <- x$Part6D[x$Part6D$cropgrwn==1,] #we only care about yield, so we extract production from column 8 yield <- yield[,c(1,8)] #add the information on plot size to the production yield <- left_join(yield, size, by="hhldid") yield$yield <- yield$prodn/yield$plotsize #next, add the column for yield to the adopt dataframe (here we use dplyr package but you could also use merge) adopt <- left_join(adopt, yield, by=c("Group.1" = "hhldid")) #then, we can plot the number of SIPs adopted against yield ggplot(adopt, aes(adopt$sum, adopt$yield)) + geom_point() ## ----------------------------------------------------------------------------- simple <- lm(manure ~ livestock, data= AllVar) summary(simple) multlinear <- lm(manure ~ livestock + plotdist, data = AllVar) summary(multlinear) ## ----------------------------------------------------------------------------- probit <- glm(manure ~ livestock + plotdist, family = binomial(link = "probit"), data = AllVar) summary(probit) logit <- glm(manure ~ livestock + plotdist, family = binomial(link = "logit"), data = AllVar) summary(logit) ## ----------------------------------------------------------------------------- library(MASS) lda <- lda(manure ~ livestock + plotdist, data = AllVar) lda ## ---- eval = FALSE------------------------------------------------------------ ## #convert to boolean and matrix ## dept <- as.matrix(AllVar[,c(3,4,7)]>0) ## ## #library(mvProbit) ## mvprobit <- mvProbit(cbind(diversity, notill, manure) ~ livestock + plotdist + subpltown + sex + age + education + size, data = cbind(dept, AllVar)) ## summary(mvprobit)