Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ jobs:
matrix:
python-version: ["3.8", "3.12"]
steps:
- uses: actions/checkout@v2
- uses: actions/cache@v2
- uses: actions/checkout@v4
- uses: actions/cache@v4
with:
path: ~/.cache/pip
key: ${{ runner.os }}-pip-${{ hashFiles('**/setup.py') }}
Expand Down
2 changes: 1 addition & 1 deletion protmapper/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
__all__ = ['map_sites', 'get_site_annotations', 'MappedSite',
'InvalidSiteException', 'ProtMapper', 'default_mapper',
'resource_dir']
'resource_dir', 'mapped_sites_to_sites']


__version__ = '0.0.29'
Expand Down
147 changes: 108 additions & 39 deletions protmapper/api.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
__all__ = ['map_sites', 'get_site_annotations', 'MappedSite',
'InvalidSiteException', 'ProtMapper', 'default_mapper']
'InvalidSiteException', 'ProtMapper', 'default_mapper',
'mapped_sites_to_sites']

import os
import csv
Expand Down Expand Up @@ -171,22 +172,45 @@ def has_mapping(self):
return (not self.not_invalid()) and \
(self.mapped_pos is not None and self.mapped_res is not None)

def get_site(self):
def get_site(self, namespace='uniprot', include_namespace=True):
"""Return the site information as a tuple.

Parameters
----------
namespace : Optional[str]
The namespace to use for the protein ID. Default is 'uniprot'.
If choosing 'hgnc', the gene name will be used as the ID.
include_namespace : Optional[bool]
If True, include the namespace in the output. Default is True
which creates a 4 tuple of the form
(prot_id, prot_ns, residue, position). If False, a 3 tuple is
created of the form (prot_id, residue, position).

Returns
-------
tuple
A tuple containing the following entries:
(prot_id, prot_ns, residue, position).
(prot_id, prot_ns, residue, position) if include_namespace is True,
(prot_id, residue, position) otherwise.
"""
if namespace not in ('uniprot', 'hgnc'):
raise ValueError("Namespace must be 'uniprot' or 'hgnc'.")
if self.not_invalid() or not self.has_mapping():
return self.up_id, 'uniprot', self.orig_res, self.orig_pos
idx = self.up_id if namespace == 'uniprot' else self.gene_name
res = self.orig_res
pos = self.orig_pos
else:
return self.mapped_id, 'uniprot', self.mapped_res, self.mapped_pos
idx = self.mapped_id if namespace == 'uniprot' else self.gene_name
res = self.mapped_res
pos = self.mapped_pos
if include_namespace:
return idx, namespace, res, pos
else:
return idx, res, pos


def mapped_sites_to_sites(mapped_sites, include_invalid=False):
def mapped_sites_to_sites(mapped_sites, include_invalid=False,
namespace='uniprot', include_namespace=True):
"""Return a list of sites from a list of MappedSite objects.

Parameters
Expand All @@ -196,14 +220,24 @@ def mapped_sites_to_sites(mapped_sites, include_invalid=False):
include_invalid : Optional[bool]
If True, include sites that are known to be invalid in the output.
Default is False.
namespace : Optional[str]
The namespace to use for the protein ID. Default is 'uniprot'.
If choosing 'hgnc', the gene name will be used as the ID.
include_namespace : Optional[bool]
If True, include the namespace in the output. Default is True
which creates a 4 tuple of the form
(prot_id, prot_ns, residue, position). If False, a 3 tuple is
created of the form (prot_id, residue, position).

Returns
-------
list of tuple
A list of tuples, each containing the following entries:
(prot_id, prot_ns, residue, position).
"""
return [ms.get_site() for ms in mapped_sites
return [ms.get_site(namespace=namespace,
include_namespace=include_namespace)
for ms in mapped_sites
if ms.not_invalid() or include_invalid]


Expand All @@ -214,8 +248,11 @@ def get_site_annotations(sites):
----------
sites : list of tuple
Each tuple in the list consists of the following entries:
(prot_id, residue, position). where prot_id has to be a
UniProt ID.
(prot_id, prot_ns, residue, position). where prot_id
is the protein ID (Uniprot or HGNC), prot_ns is the namespace
of the protein ID ('uniprot' or 'hgnc'), residue is the amino acid
residue, and position is the amino acid position, both represented
as strings.

Returns
-------
Expand Down Expand Up @@ -247,9 +284,13 @@ def _load_annotations():
with open(annotations_fname, 'r') as fh:
reader = csv.DictReader(fh)
for row in reader:
site = (row['TARGET_UP_ID'], row['TARGET_RES'], row['TARGET_POS'])
site_up = (row['TARGET_UP_ID'], 'uniprot',
row['TARGET_RES'], row['TARGET_POS'])
site_hgnc = (row['TARGET_GENE_NAME'], 'hgnc',
row['TARGET_RES'], row['TARGET_POS'])
row['evidence'] = evidences.get(row['ID'])
annotations_by_site[site].append(row)
annotations_by_site[site_up].append(row)
annotations_by_site[site_hgnc].append(row)
annotations_by_site = dict(annotations_by_site)
return annotations_by_site

Expand Down Expand Up @@ -389,7 +430,8 @@ def map_sitelist_to_human_ref(self, site_list, do_methionine_offset=True,
def map_to_human_ref(self, prot_id, prot_ns, residue, position,
do_methionine_offset=True,
do_orthology_mapping=True,
do_isoform_mapping=True):
do_isoform_mapping=True,
check_psp=True):
"""Check an agent for invalid sites and look for mappings.

Look up each modification site on the agent in Uniprot and then the
Expand Down Expand Up @@ -499,55 +541,82 @@ def map_to_human_ref(self, prot_id, prot_ns, residue, position,
self._cache[site_key] = mapped_site
return mapped_site

# There is no manual mapping, next we try to see if UniProt
# reports a signal peptide that could be responsible for the position
# being shifted
signal_peptide = uniprot_client.get_signal_peptide(up_id, False)
# If there is valid signal peptide information from UniProt
if signal_peptide and signal_peptide.begin == 1 and \
signal_peptide.end is not None:
offset_pos = str(int(position) + signal_peptide.end)
# Check to see if the offset position is known to be phosphorylated
mapped_site = self.get_psp_mapping(
up_id, up_id, gene_name, residue, position,
offset_pos, 'SIGNAL_PEPTIDE_REMOVED')
if mapped_site:
return mapped_site
# ...there's no manually curated site or signal peptide, so do mapping
# via PhosphoSite if the data is available:
human_prot = uniprot_client.is_human(up_id)
if phosphosite_client.has_data():
mapped_site = None
if phosphosite_client.has_data() or not check_psp:
# See if UniProt reports a signal peptide that could be responsible
# for the position being shifted
signal_peptide = uniprot_client.get_signal_peptide(up_id, False)
# If there is valid signal peptide information from UniProt
if signal_peptide and signal_peptide.begin == 1 and \
signal_peptide.end is not None:
offset_pos = str(int(position) + signal_peptide.end)
# Check to see if the offset position is known to be phosphorylated
if check_psp:
mapped_site = self.get_psp_mapping(
up_id, up_id, gene_name, residue, position,
offset_pos, 'SIGNAL_PEPTIDE_REMOVED')
else:
site_valid = uniprot_client.verify_location(up_id, residue,
offset_pos)
if site_valid:
mapped_site = MappedSite(
up_id, False, residue, position,
mapped_id=up_id,
mapped_res=residue,
mapped_pos=offset_pos,
description='SIGNAL_PEPTIDE_REMOVED',
gene_name=gene_name)
if mapped_site:
return mapped_site
# First, look for other entries in phosphosite for this protein
# where this sequence position is legit (i.e., other isoforms)
if do_isoform_mapping and up_id and human_prot:
mapped_site = self.get_psp_mapping(
up_id, up_id, gene_name, residue, position, position,
'INFERRED_ALTERNATIVE_ISOFORM')
if check_psp:
mapped_site = self.get_psp_mapping(
up_id, up_id, gene_name, residue, position, position,
'INFERRED_ALTERNATIVE_ISOFORM')
if mapped_site:
return mapped_site
# Try looking for rat or mouse sites
if do_orthology_mapping and up_id and human_prot:
# Get the mouse ID for this protein
up_mouse = uniprot_client.get_mouse_id(up_id)
# Get mouse sequence
mapped_site = self.get_psp_mapping(
up_id, up_mouse, gene_name, residue,
position, position, 'INFERRED_MOUSE_SITE')
if check_psp:
mapped_site = self.get_psp_mapping(
up_id, up_mouse, gene_name, residue,
position, position, 'INFERRED_MOUSE_SITE')
if mapped_site:
return mapped_site
# Try the rat sequence
up_rat = uniprot_client.get_rat_id(up_id)
mapped_site = self.get_psp_mapping(
up_id, up_rat, gene_name, residue, position,
position, 'INFERRED_RAT_SITE')
if check_psp:
mapped_site = self.get_psp_mapping(
up_id, up_rat, gene_name, residue, position,
position, 'INFERRED_RAT_SITE')
if mapped_site:
return mapped_site
# Check for methionine offset (off by one)
if do_methionine_offset and up_id and human_prot:
offset_pos = str(int(position) + 1)
mapped_site = self.get_psp_mapping(
up_id, up_id, gene_name, residue, position,
offset_pos, 'INFERRED_METHIONINE_CLEAVAGE')
if check_psp:
mapped_site = self.get_psp_mapping(
up_id, up_id, gene_name, residue, position,
offset_pos, 'INFERRED_METHIONINE_CLEAVAGE')
else:
site_valid = uniprot_client.verify_location(
up_id, residue, offset_pos)
if site_valid:
mapped_site = MappedSite(
up_id, False, residue, position,
mapped_id=up_id,
mapped_res=residue,
mapped_pos=offset_pos,
description='INFERRED_METHIONINE_CLEAVAGE',
gene_name=gene_name)
if mapped_site:
return mapped_site
# If we've gotten here, the entry is 1) not in the site map, and
Expand Down
31 changes: 31 additions & 0 deletions protmapper/tests/test_protmapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -466,3 +466,34 @@ def test_mutliple_up_ids():
pm = ProtMapper()
ms = pm.map_peptide_to_human_ref('TMPO', 'hgnc', 'RKVPRLSEKSVEE', 7)
assert ms.up_id == 'P42166'


def test_no_psp_check():
pm = ProtMapper()

# Signal peptide
ms = pm.map_to_human_ref('TTR', 'hgnc', 'S', '52',
check_psp=False)
assert ms == MappedSite(up_id='P02766', error_code=None, valid=False,
orig_res='S', orig_pos='52', mapped_id='P02766',
mapped_res='S', mapped_pos='72',
description='SIGNAL_PEPTIDE_REMOVED',
gene_name='TTR')
# Initial methionine
ms = pm.map_to_human_ref('TTR', 'hgnc', 'S', '71',
check_psp=False)
assert ms == MappedSite(up_id='P02766', error_code=None, valid=False,
orig_res='S', orig_pos='71', mapped_id='P02766',
mapped_res='S', mapped_pos='72',
description='INFERRED_METHIONINE_CLEAVAGE',
gene_name='TTR')

# A site that doesn't map to anything valid which is detected
# even without PSP checking
ms = pm.map_to_human_ref('TTR', 'hgnc', 'S', '73',
check_psp=False)
assert ms == MappedSite(up_id='P02766', error_code=None, valid=False,
orig_res='S', orig_pos='73', mapped_id=None,
mapped_res=None, mapped_pos=None,
description='NO_MAPPING_FOUND',
gene_name='TTR')