Source code for crops.iomod.taggers
"""Tags, names, etc, functions and objects are defined here."""
from crops import __prog__, __description__, __author__
from crops import __date__, __version__, __copyright__
import os
import logging
[docs]def target_format(inpath, terms=False, th=0, notfound=False):
"""Return information about the interval source for .fasta headers.
:param inpath: Path to interval database used.
:type inpath: str
:param terms: Only discard terminal segments, defaults to False.
:type terms: bool, optional
:param th: Uniprot threshold (% of original UP sequence below which segment is removed), defaults to 0.
:type th: int or float, optional
:param notfound: The sequence was not found in interval source, defaults to False.
:type notfound: bool, optional
:raises TypeError: If `th` is not a numeric (int, float) value.
:raises TypeError: If `terms` is not boolean.
:return: Extra information for .fasta headers
:rtype: str
"""
try:
th = float(th)
except Exception:
logging.critical('The threshold must be a numeric (int, float) value.')
raise TypeError
if isinstance(terms, bool) is False:
logging.critical("The 'terms' variable must have a boolean value.")
raise TypeError
if isinstance(notfound, bool) is False:
logging.critical("The 'notfound' variable must have a boolean value.")
raise TypeError
if os.path.basename(inpath) == 'pdb_chain_uniprot.csv':
src = 'SIFTS database'
else:
src = 'Custom database'
if notfound is True:
outcome = '|CROPS Unaltered sequence - reference not found in ' + src + '.)'
else:
if os.path.basename(inpath) == 'pdb_chain_uniprot.csv':
outcome = '|CROPS (UniProt via ' + src + ')'
outcome += ' - UPmin = ' + str(th) + ' %'
else:
outcome = '|CROPS (' + src + ')'
if terms is True and th == 0:
outcome += ' - Only terminals removed'
return outcome
[docs]def infix_gen(inpath, terms=False):
"""Return information about the interval source for file names.
:param inpath: Path to interval database used.
:type inpath: str
:param terms: Only discard terminal segments, defaults to False.
:type terms: bool, optional
:raises TypeError: If `terms` is not boolean.
:return: Filename's infix tag.
:rtype: str
"""
if isinstance(terms, bool) is False:
logging.critical("The 'terms' variable must have a boolean value.")
raise TypeError
if os.path.basename(inpath) == 'pdb_chain_uniprot.csv':
cut = ".to_uniprot"
else:
cut = ".custom"
if terms:
cut = ".custom"
infix_out = {
"croprenum": ".crops" + cut,
"cropseq": ".crops" + cut,
"crop": ".crops.oldids" + cut,
"renumber": ".crops.seq"}
return infix_out
[docs]def retrieve_id(seqheader):
"""Extract sequence IDs and additional comments from a standard .fasta header.
:param seqheader: Standard .fasta header, starting with ">".
:type seqheader: str
:raises ValueError: If `seqheader` is not a string.
:return: A dictionary with the sequence identifiers ('mainid', 'chains', 'seqid', 'source', 'comments').
:rtype: dict [str, str or set]
"""
if not isinstance(seqheader, str):
logging.critical('Argument is not a string.')
raise ValueError
headerinfo = {}
headerinfo['mainid'] = ""
headerinfo['chains'] = None
headerinfo['seqid'] = None
headerinfo['source'] = None
headerinfo['comments'] = None
namechar = False
idchar = False
newchid = ''
# UniProt Swiss-Prot/TrEMBL
if seqheader.startswith('>sp|') or seqheader.startswith('>tr|'):
headerinfo['seqid'] = '1'
if seqheader.startswith('>sp|'):
headerinfo['source'] = 'UniProtKB/SwissProt'
elif seqheader.startswith('>tr|'):
headerinfo['source'] = 'UniProtKB/TrEMBL'
for i in range(4, len(seqheader)):
if seqheader[i] == '|':
headerinfo['comments'] = seqheader[i+1:]
break
elif seqheader[i] == ' ' and seqheader[i:i+9] == ' archived':
headerinfo['source'] += ' (archived)'
headerinfo['comments'] = seqheader[i+1:]
break
else:
headerinfo['mainid'] += seqheader[i].upper()
if headerinfo['chains'] is None:
headerinfo['chains'] = set()
headerinfo['chains'].add(headerinfo['mainid'])
# UniRef
elif seqheader.startswith('>UniRef'):
headerinfo['seqid'] = '1'
for i in range(1, len(seqheader)):
if seqheader[i] == '_':
headerinfo['source'] = seqheader[1:i]
chi = i+1
elif seqheader[i] == ' ':
headerinfo['comments'] = seqheader[i+1:]
headerinfo['mainid'] = seqheader[chi:i].upper()
if headerinfo['chains'] is None:
headerinfo['chains'] = set()
headerinfo['chains'].add(headerinfo['mainid'])
break
# UniParc
elif seqheader.startswith('>UPI'):
headerinfo['seqid'] = '1'
for i in range(4, len(seqheader)):
headerinfo['source'] = 'UniParc'
if seqheader[i] == ' ':
headerinfo['comments'] = seqheader[i+1:]
headerinfo['mainid'] = seqheader[1:i].upper()
if headerinfo['chains'] is None:
headerinfo['chains'] = set()
headerinfo['chains'].add(headerinfo['mainid'])
break
# UniClust
elif seqheader.startswith('>uc'):
newchid = ''
headerinfo['source'] = 'UniClust'
tag = 0
for i in range(1, len(seqheader)):
newchid += seqheader[i]
if seqheader[i] == '-':
if tag == 0:
headerinfo['source'] = ('UniClust'+seqheader[3:i] + '_20' +
seqheader[i+1:i+3] + '_' +
seqheader[i+3:i+5])
tag = 1
elif tag == 1:
ii = i
newchid = ''
while True:
ii += 1
if seqheader[ii] == '|':
headerinfo['seqid'] = newchid
headerinfo['comments'] = seqheader[ii+1:]
tag = 2
break
newchid += seqheader[ii]
elif seqheader[i:i+16] == '|Representative=':
newchid = ''
ii = i + 16
while True:
newchid += seqheader[ii]
ii += 1
if seqheader[ii] == ' ':
headerinfo['mainid'] = newchid.upper()
break
elif seqheader[i:i+8] == 'Members=':
newchid = ''
headerinfo['chains'] = set()
for ii in range(i+8, len(seqheader)):
if seqheader[ii] == ',' or ii == len(seqheader)-1:
headerinfo['chains'].add(newchid)
newchid = ''
else:
newchid += seqheader[ii]
break
# PDBe
elif seqheader.startswith('>pdb|'):
headerinfo['source'] = 'PDBe'
newchid = ''
tag = 'mainid'
for i in range(5, len(seqheader)):
if seqheader[i] == '|':
if tag == 'mainid':
headerinfo[tag] = newchid.upper()
else:
headerinfo[tag] = newchid
tag = 'chains'
newchid = ''
elif seqheader[i] == '_':
if tag == 'mainid':
headerinfo[tag] = newchid.upper()
else:
headerinfo[tag] = newchid
tag = 'seqid'
newchid = ''
elif (seqheader[i] == ' ' or
seqheader[i] == ',' or
i == len(seqheader)-1):
if i == len(seqheader)-1:
newchid += seqheader[i]
if headerinfo[tag] is None:
headerinfo[tag] = set()
headerinfo[tag].add(newchid)
newchid = ''
else:
if tag == 'mainid':
newchid += seqheader[i].upper()
else:
newchid += seqheader[i]
else:
# RCSB PDB, CROPS, MrBUMP, others
if seqheader.startswith('>crops|'):
seqheader = ">" + seqheader[7:]
headerinfo['source'] = 'CROPS'
else:
headerinfo['source'] = 'RCSB PDB'
for j in range(len(seqheader)):
if seqheader[j] == ">":
idchar = True
elif seqheader[j] == ":" or seqheader[j] == "_":
idchar = False
namechar = True
elif seqheader[j] == " ":
if seqheader[j-2] == "_" and newchid != '':
try:
int(newchid)
headerinfo['seqid'] = newchid
newchid = ''
except Exception:
if headerinfo['chains'] is None:
headerinfo['chains'] = set()
headerinfo['chains'].add(newchid)
headerinfo['comments'] = seqheader[j+1:]
if seqheader[j+1:j+12] == "resolution:":
headerinfo['source'] = 'MrBUMP'
break
else:
pass
elif seqheader[j] == "[":
if seqheader[j:j+5] == "[auth" or seqheader[j:j+6] == "[ auth":
newchid = ''
elif ((seqheader[j] == "a" and seqheader[j:j+4] == 'auth') or
(seqheader[j] == "u" and seqheader[j-1:j+3] == 'auth') or
(seqheader[j] == "t" and seqheader[j-2:j+2] == 'auth') or
(seqheader[j] == "h" and seqheader[j-3:j+1] == 'auth')):
pass
elif seqheader[j] == "]":
pass
elif seqheader[j] == "|":
if idchar is True:
idchar = False
if seqheader[j+1:j+6].lower() == 'chain':
if newchid != '':
headerinfo['seqid'] = newchid
k = 0 if seqheader[j+6] == ' ' else 1
newchid = ''
for jj in range(j+6+k+1, len(seqheader)):
if seqheader[jj] == ',':
if headerinfo['chains'] is None:
headerinfo['chains'] = set()
headerinfo['chains'].add(newchid)
newchid = ''
elif seqheader[jj] == " ":
pass
elif seqheader[jj] == "[":
if (seqheader[jj:jj+5] == "[auth" or
seqheader[jj:jj+6] == "[ auth"):
newchid = ''
elif ((seqheader[jj] == "a" and seqheader[jj:jj+4] == 'auth') or
(seqheader[jj] == "u" and seqheader[jj-1:jj+3] == 'auth') or
(seqheader[jj] == "t" and seqheader[jj-2:jj+2] == 'auth') or
(seqheader[jj] == "h" and seqheader[jj-3:jj+1] == 'auth')):
pass
elif seqheader[jj] == "]":
pass
elif (seqheader[jj] == "|" or seqheader[jj] == ":" or
jj == len(seqheader)-1):
if jj == len(seqheader)-1:
newchid += seqheader[jj]
else:
headerinfo['comments'] = seqheader[jj+1:]
if headerinfo['chains'] is None:
headerinfo['chains'] = set()
headerinfo['chains'].add(newchid)
newchid = ''
namechar = False
return headerinfo
else:
newchid += seqheader[jj]
else:
if namechar is True:
headerinfo['comments'] = seqheader[j+1:]
if headerinfo['chains'] is None:
headerinfo['chains'] = set()
headerinfo['chains'].add(newchid)
else:
pass
break
elif seqheader[j] == " " and seqheader[j-1] != "_":
headerinfo['comments'] = seqheader[j+1:]
if namechar is True:
if headerinfo['chains'] is None:
headerinfo['chains'] = set()
headerinfo['chains'].add(newchid)
break
else:
if namechar is True:
newchid += seqheader[j]
elif idchar is True:
headerinfo['mainid'] += seqheader[j].upper()
return headerinfo
[docs]def makeheader(mainid=None, seqid=None, chains=None,
source=None, extrainfo=None, short=False):
"""Return a fasta header of the format ">crops|MainID_seqID|Chains chain list|extrainfo".
:param mainid: PDB ID, Uniprot ID, etc.
:type mainid: str
:param seqid: Sequence Identifier, usually a natural number: "1", "2", etc, defaults to None.
:type seqid: str, optional
:param chains: A set containing the chain IDs of monomers sharing the same sequence, defaults to None.
:type chains: set [str], optional
:param source: The source of the sequence ('RCSB PDB', 'UniProtKB/SwissProt', 'PDBe', etc).
:type source: str, optional
:param extrainfo: Additional information to be included in the header, defaults to None.
:type extrainfo: str, optional
:param short: If True, a short version of the header ('>MainID_seqID|Chains chain list') is returned, defaults to False.
:type extrainfo: bool, optional
:raises ValueError: If any of `mainid`, `seqid`, `extrainfo` or elements of `chains` are not strings, or `chains` is not a set.
:return: A fasta header.
:rtype: str
"""
if not isinstance(mainid, str):
try:
mainid = str(mainid)
except Exception:
logging.critical("Argument 'mainid' is not a string.")
raise ValueError
if short is True:
newheader = '>' + mainid.upper()
else:
newheader = '>crops|' + mainid
if seqid is not None:
if not isinstance(seqid, str):
try:
seqid = str(seqid)
except Exception:
logging.critical("Argument 'seqid' is not a string.")
raise ValueError
newheader += '_'
newheader += seqid
newheader += '|'
if chains is not None:
if not isinstance(chains, set):
logging.critical("Argument 'chains' is not a set of strings.")
raise ValueError
else:
for element in chains:
if not isinstance(element, str):
logging.critical("Argument 'chains' is not a set of strings.")
raise ValueError
if len(chains) > 0:
if len(chains) > 1:
newheader += 'Chains '
else:
newheader += 'Chain '
for element in sorted(chains):
newheader += element
newheader += ','
newheader = newheader[:-1]
if not short:
newheader += '|'
if not short:
if source is None:
source = 'Unknown'
if not isinstance(source, str):
try:
source = str(source)
except Exception:
logging.critical("Argument 'source' is not a string.")
raise ValueError
newheader += 'Source: '
newheader += source
newheader += '|'
if extrainfo is not None and not short:
if not isinstance(extrainfo, str):
try:
extrainfo = str(extrainfo)
except Exception:
logging.critical('Argument extrainfo is not a list of strings.')
raise ValueError
newheader += extrainfo
return newheader