-
Notifications
You must be signed in to change notification settings - Fork 15
make_ld_matrix
Oleksandr Frei edited this page Dec 5, 2016
·
13 revisions
Making single LD matrix across all chromosomes can be challenging due to memory limitations. We suggest the following approach:
import glob
import os.path
import os
import subprocess
import sys
import pandas as pd
def execute_command(command):
print("Execute command: {}".format(command))
print(subprocess.Popen(command.split(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()[0].decode("utf-8"))
#print(subprocess.check_output(command.split()).decode("utf-8"))
ref = {'9m': r'C:\Users\Oleksandr\Documents\GitHub\python_convert\9279485_ref.bim',
'2m': r'C:\Users\Oleksandr\Documents\GitHub\python_convert\2558411_ref.bim'}
tasks = [('2m', 'p8', 0.8),
('2m', 'p2', 0.2),
('2m', 'p1', 0.1),
('9m', 'p8', 0.8),
('9m', 'p2', 0.2),
('9m', 'p1', 0.1)]
for template, thr_name, thr_value in tasks:
for chr in range(1, 23): # we skip chr23 - this is intended
file = 'E:\{0}\ALL.chr{1}.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz'.format(template, chr);
execute_command(r'python make_ld_matrix.py --vcf {4} --ref {5} --ld_window_kb 10000 --ld_window_r2 {2} --chunksize 1000000 --savemat E:\{0}\ldmat_{0}_{1}_chr{3}.mat'.format(template, thr_name, thr_value, chr, file, ref[template]))
Later all matrices can be merged within matlab code.
function [LDmat, mafvec] = join_ld(file_pattern)
id1 = {}; id2 = {}; mafvec = [];
for i=1:22
tmp = load(sprintf('%s_chr%i.mat', file_pattern, i));
id1{i} = tmp.id1'; id2{i} = tmp.id2'; nsnp = tmp.nsnp;
if isempty(mafvec), mafvec = nan(nsnp, 1); end;
mafvec(~isnan(tmp.mafvec)) = tmp.mafvec(~isnan(tmp.mafvec));
end
LDmat = sparse(double([id1{:}]'), double([id2{:}]'), true, double(nsnp), double(nsnp));
LDmat = LDmat | speye(double(nsnp));
LDmat = LDmat | (LDmat - LDmat');
end
#folder = 'H:\\ftp\\ip113.ucsd.edu\\1000Genome\\phase3\\build37_released\\tmp';
folder = '/space/syn03/1/data/oleksandr/LDmatrix/tmp';
filename = 'ldmat_2m_p8'; [LDr2_p8sparse, mafvec] = join_ld(fullfile(folder, filename)); save(filename, 'LDr2_p8sparse', 'mafvec', '-v7.3'); clear('LDr2_p8sparse');
filename = 'ldmat_2m_p2'; [LDr2_p2sparse, mafvec] = join_ld(fullfile(folder, filename)); save(filename, 'LDr2_p2sparse', 'mafvec', '-v7.3'); clear('LDr2_p2sparse');
filename = 'ldmat_2m_p1'; [LDr2_p1sparse, mafvec] = join_ld(fullfile(folder, filename)); save(filename, 'LDr2_p1sparse', 'mafvec', '-v7.3'); clear('LDr2_p1sparse');
filename = 'ldmat_9m_p8'; [LDr2_p8sparse, mafvec] = join_ld(fullfile(folder, filename)); save(filename, 'LDr2_p8sparse', 'mafvec', '-v7.3'); clear('LDr2_p8sparse');
filename = 'ldmat_9m_p2'; [LDr2_p2sparse, mafvec] = join_ld(fullfile(folder, filename)); save(filename, 'LDr2_p2sparse', 'mafvec', '-v7.3'); clear('LDr2_p2sparse');
filename = 'ldmat_9m_p1'; [LDr2_p1sparse, mafvec] = join_ld(fullfile(folder, filename)); save(filename, 'LDr2_p1sparse', 'mafvec', '-v7.3'); clear('LDr2_p1sparse');