## ----sampindia01--------------------------------------------------------------
# Load the census table and take a first look at it.
# NOTE: the name `x` is reused further down for the voter matrix, so the
# census table is only available under this name until that point.
x <- read.csv("PC_2011.csv", stringsAsFactors = FALSE)
head(x)
dim(x)
table(x$Level)

## ----sampindia10--------------------------------------------------------------
# Keep only the records that are both village-level and rural.
villages <- x[x$Level == "VILLAGE" & x$TRU == "Rural", ]
dim(villages)

## ----sampindia100-------------------------------------------------------------
# Draw `samplesize` distinct row indices (sample() defaults to no replacement),
# weighting each village by its No_HH value (presumably the number of
# households -- confirm against the data dictionary). The fixed seed makes the
# draw reproducible.
set.seed(352020)
samplesize <- 40
i <- sample(nrow(villages), samplesize, prob = villages$No_HH)
sort(i)
sel_villages <- villages[i, ]
sort(sel_villages$Name)

## ----sampindia20--------------------------------------------------------------
# Lookup table from sub-district code to sub-district name. The name column is
# renamed so that it does not collide with the villages' own "Name" column
# when the two tables are merged.
subdist <- unique(x[x$Level == "SUB-DISTRICT", c("Subdistt", "Name")])
colnames(subdist)[2] <- "Subdistrict"
head(subdist)

## ----sampindia201-------------------------------------------------------------
# Attach the sub-district name to every selected village.
sel_villages <- merge(sel_villages, subdist, by = "Subdistt")
head(sel_villages)

## ----sampindia30--------------------------------------------------------------
# Keep the identifying columns only, then order by sub-district and village.
sel_villages <- sel_villages[, c("State", "District", "Subdistt",
                                 "Town.Village", "Subdistrict", "Name",
                                 "No_HH")]
sel_villages <- sel_villages[order(sel_villages$Subdistrict,
                                   sel_villages$Name), ]

## ----pdf----------------------------------------------------------------------
#library(pdftools)
voterfile <- "Bariyarpur-1.pdf"
# read the file
s <- pdftools::pdf_text(voterfile)
class(s)
length(s)

## ----sampvoter01--------------------------------------------------------------
# Field labels exactly as they appear in the extracted text. They are matched
# literally against the pdf_text() output, so they must not be "corrected".
# NOTE(review): presumably the house-number and voter-name labels -- confirm.
housepattern <- "गतह सपखयच : "
namepattern <- "ननरचरचक कच नचम : "

# Split the page text into trimmed lines. "\r?\n" accepts both "\r\n" and bare
# "\n" line endings; splitting on "\r\n" alone leaves pages unsplit whenever
# the extracted text uses "\n" only.
ss <- trimws(unlist(strsplit(s, "\r?\n")))

# For the lines that start with a label, split on the label. Because the line
# starts with the label, each line contributes a leading "" followed by the
# value(s) found on that line.
i <- grep(paste0("^", namepattern), ss)
si <- trimws(unlist(strsplit(ss[i], namepattern)))
j <- grep(paste0("^", housepattern), ss)
sj <- trimws(unlist(strsplit(ss[j], housepattern)))

# this should be TRUE
(length(si) == length(sj))
# cbind() would silently recycle the shorter vector and mis-pair households
# with names, so stop here if the two do not line up.
stopifnot(length(si) == length(sj))

x <- cbind(sj, si)
# Drop the rows with an empty household value (the leading "" entries produced
# by the split). drop = FALSE keeps `x` a matrix even when a single row is
# left, so the colnames<- below cannot fail.
x <- x[x[, 1] != "", , drop = FALSE]
colnames(x) <- c("household", "name")

## ----xxx----------------------------------------------------------------------
# Remove duplicated household/name rows.
hn <- unique(x)
head(hn)

## ----sampvoter20--------------------------------------------------------------
# Distinct household identifiers.
uhh <- unique(hn[, "household"])
head(uhh)
#hns <- sample(uhh, 15)
#hns

## ----sampvoter30--------------------------------------------------------------
#x <- hn[hn[, "household"] %in% hns, ]
#y <- tapply(x[,2], x[,1], function(i) paste(i, collapse=", "))
#z <- cbind(house=names(y), names=as.vector(y))

## ----sampvoter40--------------------------------------------------------------
#knitr::kable(z[1:5,])

## -----------------------------------------------------------------------------
#write.csv(z, "selection.csv", row.names = FALSE)