#### generate simulation data # population vectors n.AF=n.EU=0 q.true =c (rep(0.1,50),rep(0.9,50),rep(0,n.AF),rep(1,n.EU)) # first the admixed individuals, then the ones with known ancestry n = length(q.true) I0 = n-n.AF-n.EU M=100 # number of markers # marker probabilities p.EU = runif(M)/2 p.AF = runif(M)/2 # matrix of marker probabilities for each individual p.n = outer (p.EU, q.true) + outer (p.AF, 1-q.true) # generate observed data (two copies of each marker) G = array(data = c((runif(n*M))<=as.vector(p.n),(runif(n*M))<=as.vector(p.n)),dim=c(M,n,2)) #### end data simulation #### if you already have data start from here (initialize I0=n, n.AF=n.EU=0 if no a-priori known samples) # initialize E,Q E = array (runif (M*n*2), dim = c(M,n,2)) # no K dimension because there are two so Eima.EU = 1-Eima.AF if (n.AF > 0) E[,(I0+1):(I0+n.AF),]=0 if (n.EU > 0) E[,(I0+n.AF+1):(I0+n.AF+n.EU),]=1 Q=Q.prev = (c(runif(I0), q.true[-(1:I0)])) for (i in 1: 1000){ #M step Q[1:I0] = (apply (E[,1:I0,1],2,sum) + apply (E[,1:I0,2],2,sum))/(2*M) P.EU = (apply (E[,,1]*G[,,1],1,sum) + apply (E[,,2]*G[,,2],1,sum)) / (apply (E[,,1],1,sum) + apply (E[,,2],1,sum)) # print(cor (p.EU, P.EU)) P.AF = (apply ((1-E[,,1])*G[,,1],1,sum) + apply ((1-E[,,2])*G[,,2],1,sum)) / (apply ((1-E[,,1]),1,sum) + apply ((1-E[,,2]),1,sum)) # print(cor (p.AF, P.AF)) if (max(abs(Q-Q.prev))<0.00001) break if (i%%10 == 0) cat ("iter",i,"max change in Q: ",max(abs(Q-Q.prev)),"\n") if (max(abs(Q-Q.prev)) < 0.0005) break # can try other stopping criteria! # print(Q) #E step E=G E[,,1] = (outer (P.EU,Q) * G[,,1] + outer ((1-P.EU),Q) * (1-G[,,1]))/ ((outer (P.EU,Q) * G[,,1] + outer ((1-P.EU),Q) * (1-G[,,1])) + (outer (P.AF,(1-Q)) * G[,,1] + outer ((1-P.AF),(1-Q)) * (1-G[,,1]))) E[,,2] = (outer (P.EU,Q) * G[,,2] + outer ((1-P.EU),Q) * (1-G[,,2]))/ ((outer (P.EU,Q) * G[,,2] + outer ((1-P.EU),Q) * (1-G[,,2])) + (outer (P.AF,(1-Q)) * G[,,2] + outer ((1-P.AF),(1-Q)) * (1-G[,,2]))) if (n.AF > 0) E[,(I0+1):(I0+n.AF),]=0 if (n.EU > 0) E[,(I0+n.AF+1):(I0+n.AF+n.EU),]=1 Q.prev=Q # calculate likelihood PQ = outer(P.EU,Q) + outer(P.AF,(1-Q)) inds = cbind (rep(1:M,n*2), rep(1:n,2,each=M),rep(1:2,each=n*M)) ll = sum(log( PQ[inds[,1:2]]*G[inds] + (1-PQ[inds[,1:2]]*(!G[inds])))) if (i%%10 == 0) cat ("observed data ll:",ll,"\n") } Q[1:I0] summary(abs(Q[1:I0]-q.true[1:I0])) plot (Q[1:I0]) lines (smooth (Q[1:I0])) cor (p.EU, P.EU) cor (p.AF, P.EU)