diff --git a/data/tcr_pmhc_db/test.idx b/data/tcr_pmhc_db/test.idx new file mode 100644 index 0000000..7b9a348 --- /dev/null +++ b/data/tcr_pmhc_db/test.idx @@ -0,0 +1,103 @@ +1bd2_P +1fo0_P +1g6r_P +1kj2_P +1lp9_P +1mwa_P +1nam_P +1qse_P +1rc3_P +2bnq_P +2bnr_P +2ckb_P +2e7l_P +2esv_P +2f53_P +2f54_P +2gj6_P +2nx5_P +2oi9_P +2ol3_P +2p5e_P +2p5w_P +2pye_P +2vlk_P +2vlr_P +2ypl_P +3dxa_P +3e3q_P +3ffc_P +3gsn_P +3h9s_P +3hg1_P +3kpr_P +3mv7_P +3mv8_P +3mv9_P +3o4l_P +3pqy_P +3qdg_P +3qdj_P +3qdm_P +3qeq_P +3qfj_P +3tfk_P +3tjh_P +3uts_P +3utt_P +3vxm_P +3vxr_P +3vxs_P +3w0w_P +4eup_P +4g8g_P +4jfd_P +4jfe_P +4jff_P +4l3e_P +4mji_P +4mnq_P +4mvb_P +4mxq_P +4prp_P +4qok_P +4qrp_P +5brz_P +5bs0_P +5c07_P +5c08_P +5c09_P +5c0a_P +5c0b_P +5c0c_P +5d2n_P +5e9d_P +5eu6_P +5euo_P +5hhm_P +5hho_P +5hyj_P +5ivx_P +5jhd_P +5m00_P +5men_P +5nme_P +5til_P +5wlg_P +5xov_P +5yxn_P +6TRo_P +6amu_P +6bj3_P +6dkp_P +6eqb_P +6g9q_P +6l9l_P +6mtm_P +6q3s_P +6rp9_P +6tmo_P +6vma_P +7jwj_P +7n1f_P +7rm4_P diff --git a/predict.sh b/predict.sh index eff6da9..34465b7 100755 --- a/predict.sh +++ b/predict.sh @@ -40,9 +40,23 @@ if [ $# -eq 0 ]; then help 1 fi +############################ +echo "initialize db" +############################ +db_dir=${CWD}/data/tcr_pmhc_db + +for c in "M" "P" "A" "B"; do + if [ ! -e ${data_dir}_${c}.fa ]; then + find ${db_dir}/fasta -name "*_${c}.fasta" -exec awk '$0!=""{print $0}' {} \; > ${db_dir}_${c}.fa; + fi +done + + csv_file=$* -# convert csv to fasta files +############################ +echo "convert csv to fasta files" +############################ python ${CWD}/main.py csv_to_fasta \ --target_uri "${output_dir}${output_params}" \ --pid_prefix tcr_pmhc_test_ \ @@ -50,20 +64,23 @@ python ${CWD}/main.py csv_to_fasta \ --verbose \ ${csv_file} -# make chain.idx +############################ +echo "make chain.idx" +############################ cat ${output_dir}/mapping.idx_all | \ cut -f2 | \ awk -F _ '{printf("%s",$1);for (i=2;i ${output_dir}/chain.idx_all -# filter out ones that has only one chain -# 1. load dict a (in test dataset) from attr.idx_all -# 2. filter out those that: -# i. has no peptide -# ii. only have peptide & MHC and in dict a -# iii.has only one chain -# +############################ +echo "filter out ones that has only one chain" +echo " 1. load dict a (in test dataset) from attr.idx_all" +echo " 2. filter out those that:" +echo " i. has no peptide" +echo " ii. only have peptide & MHC and in dict a" +echo " iii.has only one chain" +############################ cat ${output_dir}/chain.idx_all | \ awk -v attr_idx=${output_dir}/attr.idx_all 'BEGIN{ while(getline ${output_dir}/chain.idx_all_blacklist -# make attr.idx for test fold_i +############################ +echo "make attr.idx" +############################ cat ${output_dir}/attr.idx_all | \ awk -v blacklist=${output_dir}/chain.idx_all_blacklist 'BEGIN{ a["xxxxxxxx"] = 1; @@ -102,7 +121,9 @@ python ${CWD}/main.py attr_update_weight_and_task \ --weight 1.0 \ data/tcr_pmhc_db/attr.idx >> ${output_dir}/attr.idx -# build the dataset (test data included) mapping.idx and chain.idx +############################ +echo "build the dataset: mapping.idx and chain.idx" +############################ cat ${CWD}/data/tcr_pmhc_db/mapping.idx ${output_dir}/mapping.idx_all > ${output_dir}/mapping.idx cat ${output_dir}/mapping.idx | \ cut -f2 | \ @@ -111,7 +132,9 @@ cat ${output_dir}/mapping.idx | \ awk -f ${CWD}/scripts/collapse.awk > ${output_dir}/chain.idx -# build fasta for each chain +############################ +echo "build fasta for each chain" +############################ for c in "A" "B" "P" "M"; do python ${CWD}/main.py fasta_extract \ --target_uri ${output_dir} \ @@ -123,14 +146,18 @@ for c in "A" "B" "P" "M"; do fi done -# align A B M with jackhmmer +############################ +echo "align chains A, B and M with jackhmmer" +############################ for c in "A" "B" "M"; do find ${CWD}/data/tcr_pmhc_db/fasta -name "*_${c}.fasta" > ${output_dir}/tcr_pmhc_db_${c} cat ${output_dir}/tcr_pmhc_db_${c} | ${CWD}/bin/mapred -m "uniref90_db=${output_dir}/tcr_pmhc_${c}.fa mgnify_db=${CWD}/data/tcr_pmhc_db_${c}.fa sh ${CWD}/scripts/run_jackhmmer.sh -o ${output_dir}/a3m" -c 10 cat ${output_dir}/tcr_pmhc_db_${c} | ${CWD}/bin/mapred -m "PIPELINE_UNIREF_MAX_HITS=1000000 PIPELINE_MGNIFY_MAX_HITS=1000000 PIPELINE_DEDUPLICATE=0 sh ${CWD}/scripts/run_pipeline.sh -o ${output_dir}/a3m" -c 10 done -# align P with equal length +############################ +echo "align chain P with equal length" +############################ python ${CWD}/main.py peptide_align \ --output_dir ${output_dir}/a3m \ --target_db ${output_dir}/tcr_pmhc_P.fa \ @@ -144,11 +171,13 @@ for c in "P"; do done # filter a3m with threshold=t +############################ +echo "filter a3m (MHC): align_ratio>=${mhc_align_ratio_threshold}" +############################ if [ -d ${output_dir}/var ]; then rm -rf ${output_dir}/var fi -echo "filter a3m (MHC): align_ratio>=${mhc_align_ratio_threshold}" cp -r ${output_dir}/a3m ${output_dir}/var python ${CWD}/main.py a3m_filter \ --output_dir ${output_dir}/var \ @@ -156,7 +185,9 @@ python ${CWD}/main.py a3m_filter \ --trim_gap \ ${CWD}/data/tcr_pmhc_db/fasta/*_M.fasta +############################ echo "predict ${csv_file}" +############################ python main.py predict \ ${model_args} \ --output_dir ${output_dir}/pred \ diff --git a/profold2 b/profold2 index d41d31e..8d908aa 160000 --- a/profold2 +++ b/profold2 @@ -1 +1 @@ -Subproject commit d41d31e173a547630669e34bcd4cf95c6bfca3db +Subproject commit 8d908aa477f0ed8cbbfa46f36b25321b8a483382