# Running the multi-level IPF algorithm(s) of Kirril Mueller. rm(list=ls()) logfile <- paste("./", gsub("-", "", gsub(":", "", gsub(" ", "_", Sys.time()))), ".log", sep="") # Convert the arguments. args <- commandArgs(trailingOnly=T) working.directory <- args[1] area <- args[2] number.of.cores <- as.integer(args[3]) only.first <- args[4] cat("Arguments passed:\n", file=logfile, append=F) cat(sprintf(fmt=" Working directory: %s\n", working.directory), file=logfile, append=T) cat(sprintf(fmt=" Area: %s\n", area), file=logfile, append=T) cat(sprintf(fmt=" Number of cores: %d\n", number.of.cores), file=logfile, append=T) cat(sprintf(fmt=" Run only first zone: %s\n\n", only.first), file=logfile, append=T) rm(args) setwd(dir=working.directory) library(data.table) seed=201406 # Installing the necessary libraries. setupIPF <- function(){ library(devtools) if(!require(package="kimisc", character.only=T)){ install.packages('kimisc') if(!require(package="kimisc", character.only=T)){ stop("Package 'kimisc' not found") } } if(!require(package="MultiLevelIPF", character.only=T)){ install_github("MultiLevelIPF", "krlmlr") if(!require(package="MultiLevelIPF", character.only=T)){ stop("Package 'MultiLevelIPF' not found") } } devtools::install_github("krlmlr/MultiLevelIPF@dfsane") library(MultiLevelIPF) } setupIPF() cat(sprintf(fmt=" Version of MultiLevelIPF package used: %s\n\n", package_version("MultiLevelIPF"))) cat(paste("Running Multi-level IPF on", area, "\n\n"), file=logfile, append=T) # Establish an area folder, and stop if it already exists!! outputfolder = paste("./", area, sep="") if( file.exists(outputfolder) ){ stop(paste("The output folder", outputfolder, "already exists and may NOT be overwritten!!")) } dir.create(path=outputfolder) # Check that the reference sample does not contain variables NOT covered in # the control totals. cat(sprintf(fmt="Copying reference sample for %s...\n", area), file=logfile, append=T) ref <- read.table(file=paste("./data2011/", area, "_R.dat", sep=""), header=T, sep="\t") kill <- unique(ref[ref$POP > 4,1]) ref <- ref[!ref$HHNR %in% kill, ] rm(kill) # Read the control totals file. cat(sprintf(fmt="Performing ml_ipf for each subplace...\n", area), file=logfile, append=T) ct <- read.table(file=paste("./data2011/", area, "_C.dat", sep=""), header=T, sep="\t") ct[ct == 0.001] <- 0 # Set up the parallel infrastructure. library(foreach) library(doMC) registerDoMC() options(cores=number.of.cores) # Execute the fitting algorithm, in parallel, for each row (zone). if(only.first){ zone.range <- 1 } else{ zone.range <- nrow(ct) } results.per.zone <- foreach(i=1:zone.range, .combine=rbind) %dopar% { zone <- ct[i,1] # Only consider this zone if there are actually people in it. That means, if # EITHER the group control totals OR the individual control totals have no # number greater than zero, the zone is ignored. if(sum(ct[i, 2:9] > 0) == 0 | sum(ct[i, 10:21] > 0) == 0){ cat(sprintf(fmt=" %03d) %d... SKIPPED\n", i, zone), file=logfile, append=T) # Return the result of the zone data.frame(zone=zone, success=FALSE, residual=NA) } else{ # Set up a folder for each thread/zone zoneFolder <- paste(outputfolder, "/", zone, sep="") dir.create(zoneFolder) # Move the cleaned up reference sample to the zone's folder. write.table(x=ref, file=paste(zoneFolder, "/REF.dat", sep=""), sep="\t", row.names=F, col.names=T, append=F, quote=F) # Copy the config file into the folder. copy.config <- file.copy(from="./config.xml", to=paste(zoneFolder, "/config.xml", sep="")) # Create the individual control totals, namely for race and gender. ind <- data.frame(POP=c(1,2,3,4,1,2,3,4), GEN=rep(x=c(1,2), each=4)) totals <- t(as.vector(ct[i,2:9])) ind$N <- totals write.table(x=ind, file=paste(zoneFolder, "/IC.dat", sep=""), sep="\t", append=F, quote=F, row.names=F, col.names=T) # Create the group control totals, namely household income grp <- data.frame(INC=1:12) totals <- t(as.vector(ct[i,10:21])) grp$N <- totals write.table(x=grp, file=paste(zoneFolder, "/GC.dat", sep=""), sep="\t", append=F, quote=F, row.names=F, col.names=T) # Import the required IPF data, and remove the files afterwards. import <- import_IPAF_results(path=zoneFolder, config_name="config.xml") ref.deleted <- file.remove(paste(zoneFolder, "/REF.dat", sep="")) ic.deleted <- file.remove(paste(zoneFolder, "/IC.dat", sep="")) gc.deleted <- file.remove(paste(zoneFolder, "/GC.dat", sep="")) gc.deleted <- file.remove(paste(zoneFolder, "/config.xml", sep="")) dir.deleted <- file.remove(zoneFolder) # Run the multi-level IPF. ml.fit <- ml_fit(ref_sample=import$refSample, controls=import$controls, field_names=import$fieldNames, verbose=T, dfsane_args=list( control=list(tol=1e-3, maxit=10000) ) ) # Whether successful or not, write weights, sample households, and write # their members. Just note the convergence status in the filename. if(ml.fit$success){ status <- "SUCCESS" postfix <- "" } else{ status <- "FAILED" postfix <- paste(" (residual: ", ml.fit$bbout$residual, ")", sep="") } set.seed(seed=seed) # Write the fitted weights to file. ref$IPF_WEIGHT <- ml.fit$weights write.table(x=ml.fit$weights, file=paste(outputfolder, "/", zone, "_", status, "_weights.dat", sep=""), quote=F, row.names=F, col.names=F) unique.households <- ref[!duplicated(ref$HHNR), ] household.ids <- unique.households$HHNR household.probs <- unique.households$IPF_WEIGHT/sum(unique.households$IPF_WEIGHT) sampled.households <- sample(x=1:length(household.ids), size=sum(grp$N), replace=T, prob=household.probs) households.sampled <- household.ids[sampled.households] ref.table <- data.table(ref) ref.table$HHNR <- as.character(ref$HHNR) setkey(ref.table, HHNR) persons.sampled <- ref.table[as.character(households.sampled), allow.cartesian=T] write.table(x=persons.sampled, file=paste(outputfolder, "/", zone, "_", status, "_persons.dat", sep=""), quote=F, row.names=F, col.names=T) # Return the result of the zone cat(sprintf(fmt=" %003d) %d... %s%s\n", i, zone, status, postfix), file=logfile, append=T) data.frame(zone=zone, success=ml.fit$success, residual=ml.fit$bbout$residual) } } # Write output to file. write.table(x=results.per.zone, file=paste(outputfolder, "/", area, ".txt", sep=""), sep="\t", col.names=T, row.names=F, quote=F)