library(permute) ShunkLength <- 5000 ## length suffeled shunks of original data dat <- scan(file="ALL.chr1.merged_beagle_mach.20101123.snps_indels_svs.genotypes_mat.txt") haploN <- as.integer(dat[1]) snps <- as.integer(dat[2]) Isnps <- vector("integer",haploN) Ipos <- list() j=3 k=1 for (i in 1:haploN) { Lsnps <- as.integer(dat[j]) Isnps[k] <- Lsnps Ipos[[k]] <- as.integer(dat[(j+1):(j+Lsnps)]) j <- j+2*Lsnps+1 k <- k+1 } annot <- read.table(paste("ALL.chr1.merged_beagle_mach.20101123.snps_indels_svs.genotypes_annot.txt",sep=""),header = FALSE, sep = " ", quote = "",as.is = TRUE,skip=2) for (i in 3:9) { annot[[i]] <- gsub(",",";",annot[[i]]) } for (i in 4:5) { annot[[i]] <- gsub("<","A",annot[[i]]) annot[[i]] <- gsub(">","A",annot[[i]]) } pos <- annot[,2] ref <- substr(annot[,4],1,1) alt <- substr(annot[,5],1,1) Dpos <- c(sample(50:150,1),diff(pos)) startS <- c(1) endS <- c() k <- 2 i1 <- 1 oldStart <- k - 1 goOn <- TRUE while (goOn) { tt1 <- pos[oldStart] while ((k<=snps)&&(pos[k]-tt1 < ShunkLength)) { k <- k+1 } if (k>=snps) { endS <- c(endS,snps) goOn <- FALSE } else { Tend <- max((k-1),oldStart+10) endS <- c(endS,Tend) k <- Tend+2 i1 <- i1+1 oldStart <- k-1 startS <- c(startS,oldStart) } } Nshunks <- i1 # divide chunks into n clusters: 1.chunk in 1.cluster # n.chunk in n.cluster (n+1)chunk in 1.cluster .... # Size of clusters is m # cluster1: 1 n+1 2n+1 ... 2m-1 # cluster2: 2 n+2 2n+2 ... 2m # Nshunks = n * m # # maximize dist: NewChunk1 to NewChunk2 (within clusters) -> maximize n # # maximize dist: chunk1 to chunk2 (between clusters) -> maximize m # n <- round(sqrt(Nshunks)) clusters <- (0:(Nshunks-1))%%n + 1 Sshunks <- c() for (i in 1:n) { cl <- which(clusters==i) Sshunks <- c(Sshunks,cl) } rra <- c() for (ss in 1:Nshunks) { st <- Sshunks[ss] ran <- startS[st]:endS[st] rra <- c(rra,ran) } NDpos <- Dpos[rra] Nannot <- annot[rra,] Nannot[,2] <- diffinv(NDpos)[1:snps]+1 write("Individuals: 1092",file="shuffle_annot.txt",ncolumns=10) write("SNPs : 3201157",file="shuffle_annot.txt",append=TRUE,ncolumns=10) write.table(format(Nannot,scientific=FALSE),file="shuffle_annot.txt",append=TRUE,quote=FALSE,row.names = FALSE,col.names = FALSE, sep = " ") indI <- (sort.int(rra, index.return = TRUE))$ix write(haploN,file="shuffle_mat.txt",ncolumns=10) write(snps,file="shuffle_mat.txt",append=TRUE,ncolumns=10) for (k in 1:haploN) { Lsnps <- Isnps[k] Tpos <- sort(indI[Ipos[[k]]]) write(Lsnps,file="shuffle_mat.txt",append=TRUE,ncolumns=10) write(Tpos,file="shuffle_mat.txt",append=TRUE,ncolumns=1000000) ones <- rep("1.0",Lsnps) write(ones,file="shuffle_mat.txt",append=TRUE,ncolumns=1000000) }