Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 103 additions & 0 deletions data/tcr_pmhc_db/test.idx
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
1bd2_P
1fo0_P
1g6r_P
1kj2_P
1lp9_P
1mwa_P
1nam_P
1qse_P
1rc3_P
2bnq_P
2bnr_P
2ckb_P
2e7l_P
2esv_P
2f53_P
2f54_P
2gj6_P
2nx5_P
2oi9_P
2ol3_P
2p5e_P
2p5w_P
2pye_P
2vlk_P
2vlr_P
2ypl_P
3dxa_P
3e3q_P
3ffc_P
3gsn_P
3h9s_P
3hg1_P
3kpr_P
3mv7_P
3mv8_P
3mv9_P
3o4l_P
3pqy_P
3qdg_P
3qdj_P
3qdm_P
3qeq_P
3qfj_P
3tfk_P
3tjh_P
3uts_P
3utt_P
3vxm_P
3vxr_P
3vxs_P
3w0w_P
4eup_P
4g8g_P
4jfd_P
4jfe_P
4jff_P
4l3e_P
4mji_P
4mnq_P
4mvb_P
4mxq_P
4prp_P
4qok_P
4qrp_P
5brz_P
5bs0_P
5c07_P
5c08_P
5c09_P
5c0a_P
5c0b_P
5c0c_P
5d2n_P
5e9d_P
5eu6_P
5euo_P
5hhm_P
5hho_P
5hyj_P
5ivx_P
5jhd_P
5m00_P
5men_P
5nme_P
5til_P
5wlg_P
5xov_P
5yxn_P
6TRo_P
6amu_P
6bj3_P
6dkp_P
6eqb_P
6g9q_P
6l9l_P
6mtm_P
6q3s_P
6rp9_P
6tmo_P
6vma_P
7jwj_P
7n1f_P
7rm4_P
61 changes: 46 additions & 15 deletions predict.sh
Original file line number Diff line number Diff line change
Expand Up @@ -40,30 +40,47 @@ if [ $# -eq 0 ]; then
help 1
fi

############################
echo "initialize db"
############################
db_dir=${CWD}/data/tcr_pmhc_db

# Build one combined FASTA per chain letter (M, P, A, B) next to the database
# directory, i.e. ${db_dir}_${c}.fa, by concatenating every *_${c}.fasta found
# under ${db_dir}/fasta with empty lines removed.
#
# The existence check must look at the same path that is written below;
# the combined file is only rebuilt when it is missing.
for c in "M" "P" "A" "B"; do
  if [ ! -e "${db_dir}_${c}.fa" ]; then
    # awk drops empty lines so records from different files stay contiguous.
    find "${db_dir}/fasta" -name "*_${c}.fasta" -exec awk '$0!=""{print $0}' {} \; > "${db_dir}_${c}.fa"
  fi
done


# Collect the positional parameters still present at this point into a single
# string (joined with the first character of IFS).
# NOTE(review): the string is expanded unquoted below, so it is word-split
# again; a csv path containing whitespace would be broken into several
# arguments - confirm that inputs never contain spaces.
csv_file=$*

# convert csv to fasta files
############################
echo "convert csv to fasta files"
############################
# Runs the csv_to_fasta sub-command of main.py on the csv file(s); the target
# URI is ${output_dir} immediately followed by ${output_params} (both are set
# outside this section).
python ${CWD}/main.py csv_to_fasta \
--target_uri "${output_dir}${output_params}" \
--pid_prefix tcr_pmhc_test_ \
--default_y=1.0 \
--verbose \
${csv_file}

# make chain.idx
############################
echo "make chain.idx"
############################
# Pipeline, stage by stage:
#   1. take the 2nd tab-separated column of mapping.idx_all;
#   2. split each value on "_" and re-emit it as "<everything before the last
#      '_'> <last field>" (e.g. "1bd2_P" becomes "1bd2 P");
#   3. sort, keeping sort's temporary files in the current directory (-T .);
#   4. feed the sorted pairs to scripts/collapse.awk and store the result in
#      chain.idx_all.
# NOTE(review): collapse.awk is not visible here - presumably it merges the
# suffixes that share a prefix into one line; confirm against the script.
cat ${output_dir}/mapping.idx_all | \
cut -f2 | \
awk -F _ '{printf("%s",$1);for (i=2;i<NF;++i) printf("_%s", $i); printf(" %s\n", $NF);}' | \
sort -T . | \
awk -f ${CWD}/scripts/collapse.awk > ${output_dir}/chain.idx_all

# Filter out entries that have only one chain.
# 1. Load dict a (entries in the test dataset) from attr.idx_all.
# 2. Filter out the entries that:
#    i.   have no peptide,
#    ii.  have only peptide & MHC and are in dict a,
#    iii. have only one chain.
# NOTE(review): the filter itself is only partly visible below; the criteria
# above restate the original description - confirm against the awk program.
############################
echo "filter out ones that has only one chain"
echo " 1. load dict a (in test dataset) from attr.idx_all"
echo " 2. filter out those that:"
echo " i. has no peptide"
echo " ii. only have peptide & MHC and in dict a"
echo " iii.has only one chain"
############################
cat ${output_dir}/chain.idx_all | \
awk -v attr_idx=${output_dir}/attr.idx_all 'BEGIN{
while(getline<attr_idx) {
Expand All @@ -84,7 +101,9 @@ cat ${output_dir}/chain.idx_all | \
print $0;
}' > ${output_dir}/chain.idx_all_blacklist

# make attr.idx for test fold_i
############################
echo "make attr.idx"
############################
cat ${output_dir}/attr.idx_all | \
awk -v blacklist=${output_dir}/chain.idx_all_blacklist 'BEGIN{
a["xxxxxxxx"] = 1;
Expand All @@ -102,7 +121,9 @@ python ${CWD}/main.py attr_update_weight_and_task \
--weight 1.0 \
data/tcr_pmhc_db/attr.idx >> ${output_dir}/attr.idx

# build the dataset (test data included) mapping.idx and chain.idx
############################
echo "build the dataset: mapping.idx and chain.idx"
############################
cat ${CWD}/data/tcr_pmhc_db/mapping.idx ${output_dir}/mapping.idx_all > ${output_dir}/mapping.idx
cat ${output_dir}/mapping.idx | \
cut -f2 | \
Expand All @@ -111,7 +132,9 @@ cat ${output_dir}/mapping.idx | \
awk -f ${CWD}/scripts/collapse.awk > ${output_dir}/chain.idx


# build fasta for each chain
############################
echo "build fasta for each chain"
############################
for c in "A" "B" "P" "M"; do
python ${CWD}/main.py fasta_extract \
--target_uri ${output_dir} \
Expand All @@ -123,14 +146,18 @@ for c in "A" "B" "P" "M"; do
fi
done

# align A B M with jackhmmer
############################
echo "align chains A, B and M with jackhmmer"
############################
# For each of the chains A, B and M: list every *_${c}.fasta under
# data/tcr_pmhc_db/fasta into ${output_dir}/tcr_pmhc_db_${c}, then pipe that
# list into bin/mapred, which is given the command string after -m and "-c 10".
# NOTE(review): mapred is not visible here - presumably it runs the -m command
# once per input line with up to 10 concurrent workers; confirm.
# NOTE(review): the two mapred invocations below look like the before/after
# versions of the same line in a diff (run_jackhmmer.sh with explicit
# uniref90_db/mgnify_db vs. run_pipeline.sh with PIPELINE_* settings); confirm
# which of them is meant to remain - as written, both run and both write to
# ${output_dir}/a3m.
for c in "A" "B" "M"; do
find ${CWD}/data/tcr_pmhc_db/fasta -name "*_${c}.fasta" > ${output_dir}/tcr_pmhc_db_${c}
cat ${output_dir}/tcr_pmhc_db_${c} | ${CWD}/bin/mapred -m "uniref90_db=${output_dir}/tcr_pmhc_${c}.fa mgnify_db=${CWD}/data/tcr_pmhc_db_${c}.fa sh ${CWD}/scripts/run_jackhmmer.sh -o ${output_dir}/a3m" -c 10
cat ${output_dir}/tcr_pmhc_db_${c} | ${CWD}/bin/mapred -m "PIPELINE_UNIREF_MAX_HITS=1000000 PIPELINE_MGNIFY_MAX_HITS=1000000 PIPELINE_DEDUPLICATE=0 sh ${CWD}/scripts/run_pipeline.sh -o ${output_dir}/a3m" -c 10
done

# align P with equal length
############################
echo "align chain P with equal length"
############################
python ${CWD}/main.py peptide_align \
--output_dir ${output_dir}/a3m \
--target_db ${output_dir}/tcr_pmhc_P.fa \
Expand All @@ -144,19 +171,23 @@ for c in "P"; do
done

# Filter the MHC alignments by aligned ratio.
#
# A fresh working copy of ${output_dir}/a3m is made at ${output_dir}/var and
# main.py a3m_filter is run on it for every *_M.fasta in the database, with
# --aligned_ratio_threshold ${mhc_align_ratio_threshold} and --trim_gap.
############################
echo "filter a3m (MHC): align_ratio>=${mhc_align_ratio_threshold}"
############################
# Abort if output_dir is empty or unset: otherwise the cleanup below would
# expand to "rm -rf /var".
: "${output_dir:?output_dir must be set}"

# Start from a clean copy so results of a previous run do not leak in.
if [ -d "${output_dir}/var" ]; then
  rm -rf -- "${output_dir:?}/var"
fi

cp -r "${output_dir}/a3m" "${output_dir}/var"
python "${CWD}/main.py" a3m_filter \
  --output_dir "${output_dir}/var" \
  --aligned_ratio_threshold ${mhc_align_ratio_threshold} \
  --trim_gap \
  "${CWD}"/data/tcr_pmhc_db/fasta/*_M.fasta

############################
echo "predict ${csv_file}"
############################
python main.py predict \
${model_args} \
--output_dir ${output_dir}/pred \
Expand Down
2 changes: 1 addition & 1 deletion profold2
Submodule profold2 updated 69 files
+0 −1 .dockerignore
+38 −0 .github/ISSUE_TEMPLATE/bug_report.md
+20 −0 .github/ISSUE_TEMPLATE/feature_request.md
+18 −0 .github/workflows/docker-image.yml
+11 −11 README.md
+0 −355 data_maker.py
+11 −22 docker/Dockerfile
+0 −42 docker/openmm.patch
+2 −0 docker/requirements.txt
+69 −51 docker/run_docker.py
+19 −8 examples/evaluate.job
+20 −8 examples/predict.job
+20 −8 examples/train.job
+32 −4 install_env.sh
+2 −66 main.py
+7 −13 profold2/command/evaluator.py
+79 −0 profold2/command/main.py
+3 −2 profold2/command/predictor.py
+0 −0 profold2/command/relaxer.py
+1 −18 profold2/command/trainer.py
+45 −19 profold2/command/worker.py
+6 −2 profold2/common/protein.py
+805 −285 profold2/common/residue_constants.py
+337 −113 profold2/data/dataset.py
+2 −2 profold2/data/mmcif_parsing.py
+7 −7 profold2/data/pipeline.py
+1 −1 profold2/data/tools/hhblits.py
+1 −1 profold2/data/tools/hhsearch.py
+1 −1 profold2/data/tools/hmmbuild.py
+1 −1 profold2/data/tools/hmmsearch.py
+1 −1 profold2/data/tools/jackhmmer.py
+1 −1 profold2/data/tools/kalign.py
+64 −10 profold2/data/utils.py
+121 −81 profold2/model/alphafold2.py
+173 −71 profold2/model/commons.py
+194 −103 profold2/model/evoformer.py
+56 −170 profold2/model/features.py
+30 −29 profold2/model/folding.py
+165 −41 profold2/model/functional.py
+187 −116 profold2/model/head.py
+6 −14 profold2/model/kernel/builder.py
+17 −6 profold2/relax/amber_minimize.py
+49 −53 profold2/tools/energy.py
+23 −719 profold2/utils.py
+7 −5 requirements.txt
+0 −7 serving.properties
+0 −179 serving_client.py
+0 −102 serving_handler.py
+0 −45 serving_model.py
+0 −29 serving_workflow.py
+0 −18 serving_workflow.yaml
+15 −2 setup.py
+0 −148 tests/test_utils.py
+0 −196 tools/ab_add_plddt.py
+83 −0 tools/bprna_eval.py
+2 −2 tools/contact_plot.py
+144 −14 tools/dataset.py
+140 −48 tools/dataset_from_pdb.py
+0 −108 tools/domain_split.py
+357 −0 tools/fr3d_interaction_to_basepair.py
+144 −0 tools/model_ops.py
+0 −556 tools/msa_select.py
+22 −20 tools/pdb_convert.py
+271 −0 tools/pdb_extract_comp_id.py
+94 −80 web/app.py
+31 −18 web/db.py
+9 −5 web/form.py
+88 −56 web/task.py
+12 −13 web/utils.py