# Source code for crops.iomod.taggers

"""Tags, names, etc, functions and objects are defined here."""

from crops import __prog__, __description__, __author__
from crops import __date__, __version__, __copyright__

import os
import logging


def target_format(inpath, terms=False, th=0, notfound=False):
    """Return information about the interval source for .fasta headers.

    :param inpath: Path to interval database used.
    :type inpath: str
    :param terms: Only discard terminal segments, defaults to False.
    :type terms: bool, optional
    :param th: Uniprot threshold (% of original UP sequence below which segment is removed), defaults to 0.
    :type th: int or float, optional
    :param notfound: The sequence was not found in interval source, defaults to False.
    :type notfound: bool, optional

    :raises TypeError: If `th` is not a numeric (int, float) value.
    :raises TypeError: If `terms` is not boolean.
    :raises TypeError: If `notfound` is not boolean.

    :return: Extra information for .fasta headers
    :rtype: str

    """
    try:
        th = float(th)
    except (TypeError, ValueError, OverflowError) as exc:
        msg = 'The threshold must be a numeric (int, float) value.'
        logging.critical(msg)
        raise TypeError(msg) from exc
    if not isinstance(terms, bool):
        msg = "The 'terms' variable must have a boolean value."
        logging.critical(msg)
        raise TypeError(msg)
    if not isinstance(notfound, bool):
        msg = "The 'notfound' variable must have a boolean value."
        logging.critical(msg)
        raise TypeError(msg)

    # Only a file with this exact base name is reported as the SIFTS database;
    # any other file is reported as a custom database.
    is_sifts = os.path.basename(inpath) == 'pdb_chain_uniprot.csv'
    src = 'SIFTS database' if is_sifts else 'Custom database'

    if notfound:
        # NOTE(review): the trailing '.)' has no matching '(' in this tag; the
        # text is kept byte-for-byte because it is written to output headers.
        return '|CROPS Unaltered sequence - reference not found in ' + src + '.)'

    if is_sifts:
        outcome = '|CROPS (UniProt via ' + src + ')'
        # th was converted to float above, so it is printed as e.g. '0.0'.
        outcome += ' - UPmin = ' + str(th) + ' %'
    else:
        outcome = '|CROPS (' + src + ')'
    if terms and th == 0:
        outcome += ' - Only terminals removed'

    return outcome


def infix_gen(inpath, terms=False):
    """Return information about the interval source for file names.

    :param inpath: Path to interval database used.
    :type inpath: str
    :param terms: Only discard terminal segments, defaults to False.
    :type terms: bool, optional

    :raises TypeError: If `terms` is not boolean.

    :return: Filename infix tags, keyed by operation ('croprenum', 'cropseq', 'crop', 'renumber').
    :rtype: dict [str, str]

    """
    if not isinstance(terms, bool):
        msg = "The 'terms' variable must have a boolean value."
        logging.critical(msg)
        raise TypeError(msg)

    is_sifts = os.path.basename(inpath) == 'pdb_chain_uniprot.csv'
    # The '.to_uniprot' tag is only used for the SIFTS file when terms is
    # False; with terms=True the tag is always '.custom'.
    cut = ".to_uniprot" if is_sifts and not terms else ".custom"

    return {
        "croprenum": ".crops" + cut,
        "cropseq": ".crops" + cut,
        "crop": ".crops.oldids" + cut,
        # The renumber infix does not depend on the interval source.
        "renumber": ".crops.seq",
    }


def retrieve_id(seqheader):
    """Extract sequence IDs and additional comments from a standard .fasta header.

    The parser is chosen from the header prefix: '>sp|' / '>tr|' (UniProt),
    '>UniRef', '>UPI' (UniParc), '>uc' (UniClust), '>pdb|' (PDBe); any other
    header (including '>crops|') goes through the generic RCSB PDB / CROPS /
    MrBUMP parser.

    :param seqheader: Standard .fasta header, starting with ">".
    :type seqheader: str

    :raises ValueError: If `seqheader` is not a string.

    :return: A dictionary with the sequence identifiers ('mainid', 'chains', 'seqid', 'source', 'comments').
    :rtype: dict [str, str or set]

    """
    if not isinstance(seqheader, str):
        msg = 'Argument is not a string.'
        logging.critical(msg)
        raise ValueError(msg)
    headerinfo = {
        'mainid': "",
        'chains': None,
        'seqid': None,
        'source': None,
        'comments': None,
    }
    namechar = False
    idchar = False
    newchid = ''
    # UniProt Swiss-Prot/TrEMBL
    if seqheader.startswith('>sp|') or seqheader.startswith('>tr|'):
        headerinfo['seqid'] = '1'
        if seqheader.startswith('>sp|'):
            headerinfo['source'] = 'UniProtKB/SwissProt'
        else:
            headerinfo['source'] = 'UniProtKB/TrEMBL'
        # The ID runs from index 4 up to the next '|' or to ' archived'.
        for i in range(4, len(seqheader)):
            if seqheader[i] == '|':
                headerinfo['comments'] = seqheader[i+1:]
                break
            elif seqheader[i:i+9] == ' archived':
                headerinfo['source'] += ' (archived)'
                headerinfo['comments'] = seqheader[i+1:]
                break
            else:
                headerinfo['mainid'] += seqheader[i].upper()
        headerinfo['chains'] = {headerinfo['mainid']}

    # UniRef
    elif seqheader.startswith('>UniRef'):
        headerinfo['seqid'] = '1'
        # chi/end delimit the ID. The defaults cover headers without '_'
        # (chi used to be unbound) and without a space (ID used to be lost).
        chi = 1
        end = len(seqheader)
        for i in range(1, len(seqheader)):
            if seqheader[i] == '_':
                # The last '_' before the first space splits source from ID.
                headerinfo['source'] = seqheader[1:i]
                chi = i+1
            elif seqheader[i] == ' ':
                headerinfo['comments'] = seqheader[i+1:]
                end = i
                break
        headerinfo['mainid'] = seqheader[chi:end].upper()
        headerinfo['chains'] = {headerinfo['mainid']}

    # UniParc
    elif seqheader.startswith('>UPI'):
        headerinfo['seqid'] = '1'
        headerinfo['source'] = 'UniParc'
        # The ID is everything after '>' up to the first space, or up to the
        # end of the header when there is no space.
        end = seqheader.find(' ', 4)
        if end == -1:
            end = len(seqheader)
        else:
            headerinfo['comments'] = seqheader[end+1:]
        headerinfo['mainid'] = seqheader[1:end].upper()
        headerinfo['chains'] = {headerinfo['mainid']}

    # UniClust
    elif seqheader.startswith('>uc'):
        headerinfo['source'] = 'UniClust'
        tag = 0  # 0: before first '-', 1: before second '-', 2: seqid read
        for i in range(1, len(seqheader)):
            if seqheader[i] == '-':
                if tag == 0:
                    # Text before the first '-' and the four characters after
                    # it are assembled as 'UniClust<xx>_20<yy>_<zz>'.
                    headerinfo['source'] = ('UniClust'+seqheader[3:i] + '_20' +
                                            seqheader[i+1:i+3] + '_' +
                                            seqheader[i+3:i+5])
                    tag = 1
                elif tag == 1:
                    # seqid is the text between the second '-' and the next
                    # '|'; bounded search so a missing '|' cannot raise.
                    bar = seqheader.find('|', i+1)
                    if bar != -1:
                        headerinfo['seqid'] = seqheader[i+1:bar]
                        headerinfo['comments'] = seqheader[bar+1:]
                        tag = 2
            elif seqheader[i:i+16] == '|Representative=':
                # The representative ID always includes its first character
                # and ends at the next space (or at the end of the header).
                stop = seqheader.find(' ', i+17)
                if stop == -1:
                    stop = len(seqheader)
                headerinfo['mainid'] = seqheader[i+16:stop].upper()
            elif seqheader[i:i+8] == 'Members=':
                newchid = ''
                headerinfo['chains'] = set()
                lastpos = len(seqheader)-1
                for ii in range(i+8, len(seqheader)):
                    if seqheader[ii] == ',' or ii == lastpos:
                        # Keep the final character of the last member, as the
                        # PDBe and generic parsers do for the last chain.
                        if seqheader[ii] != ',':
                            newchid += seqheader[ii]
                        headerinfo['chains'].add(newchid)
                        newchid = ''
                    else:
                        newchid += seqheader[ii]
                break

    # PDBe
    elif seqheader.startswith('>pdb|'):
        headerinfo['source'] = 'PDBe'
        newchid = ''
        tag = 'mainid'
        # NOTE(review): if a space, comma or end-of-header is reached while
        # tag is still 'mainid' (no '|' or '_' seen yet), headerinfo['mainid']
        # is a str and .add() fails. Left unchanged; confirm the expected
        # PDBe header layout before altering.
        for i in range(5, len(seqheader)):
            if seqheader[i] == '|':
                if tag == 'mainid':
                    headerinfo[tag] = newchid.upper()
                else:
                    headerinfo[tag] = newchid
                tag = 'chains'
                newchid = ''
            elif seqheader[i] == '_':
                if tag == 'mainid':
                    headerinfo[tag] = newchid.upper()
                else:
                    headerinfo[tag] = newchid
                tag = 'seqid'
                newchid = ''
            elif (seqheader[i] == ' ' or
                  seqheader[i] == ',' or
                  i == len(seqheader)-1):
                if i == len(seqheader)-1:
                    newchid += seqheader[i]
                if headerinfo[tag] is None:
                    headerinfo[tag] = set()
                headerinfo[tag].add(newchid)
                newchid = ''
            else:
                if tag == 'mainid':
                    newchid += seqheader[i].upper()
                else:
                    newchid += seqheader[i]

    else:
        # RCSB PDB, CROPS, MrBUMP, others
        if seqheader.startswith('>crops|'):
            seqheader = ">" + seqheader[7:]
            headerinfo['source'] = 'CROPS'
        else:
            headerinfo['source'] = 'RCSB PDB'
        # idchar: currently reading the main ID (after '>').
        # namechar: currently reading a chain name / sequence number
        # (after ':' or '_').
        for j in range(len(seqheader)):
            if seqheader[j] == ">":
                idchar = True
            elif seqheader[j] == ":" or seqheader[j] == "_":
                idchar = False
                namechar = True
            elif seqheader[j] == " ":
                # Only a space right after a single character that follows
                # '_' is meaningful; every other space is ignored here.
                if seqheader[j-2] == "_" and newchid != '':
                    try:
                        int(newchid)
                    except ValueError:
                        # Not a number: treat it as a chain name and stop.
                        if headerinfo['chains'] is None:
                            headerinfo['chains'] = set()
                        headerinfo['chains'].add(newchid)
                        headerinfo['comments'] = seqheader[j+1:]
                        if seqheader[j+1:j+12] == "resolution:":
                            headerinfo['source'] = 'MrBUMP'
                        break
                    else:
                        headerinfo['seqid'] = newchid
                        newchid = ''
            elif seqheader[j] == "[":
                # '[auth X]' replaces the name read so far with X.
                if seqheader[j:j+5] == "[auth" or seqheader[j:j+6] == "[ auth":
                    newchid = ''
            elif ((seqheader[j] == "a" and seqheader[j:j+4] == 'auth') or
                  (seqheader[j] == "u" and seqheader[j-1:j+3] == 'auth') or
                  (seqheader[j] == "t" and seqheader[j-2:j+2] == 'auth') or
                  (seqheader[j] == "h" and seqheader[j-3:j+1] == 'auth')):
                # Skip the letters of the word 'auth'.
                pass
            elif seqheader[j] == "]":
                pass
            elif seqheader[j] == "|":
                idchar = False
                if seqheader[j+1:j+6].lower() == 'chain':
                    if newchid != '':
                        headerinfo['seqid'] = newchid
                    # 'Chain ' vs 'Chains ': skip the optional plural 's'.
                    # Slicing avoids an IndexError on a header that ends
                    # right after 'chain'.
                    k = 0 if seqheader[j+6:j+7] == ' ' else 1
                    newchid = ''
                    for jj in range(j+6+k+1, len(seqheader)):
                        if seqheader[jj] == ',':
                            if headerinfo['chains'] is None:
                                headerinfo['chains'] = set()
                            headerinfo['chains'].add(newchid)
                            newchid = ''
                        elif seqheader[jj] == " ":
                            pass
                        elif seqheader[jj] == "[":
                            if (seqheader[jj:jj+5] == "[auth" or
                                    seqheader[jj:jj+6] == "[ auth"):
                                newchid = ''
                        elif ((seqheader[jj] == "a" and seqheader[jj:jj+4] == 'auth') or
                              (seqheader[jj] == "u" and seqheader[jj-1:jj+3] == 'auth') or
                              (seqheader[jj] == "t" and seqheader[jj-2:jj+2] == 'auth') or
                              (seqheader[jj] == "h" and seqheader[jj-3:jj+1] == 'auth')):
                            pass
                        elif seqheader[jj] == "]":
                            pass
                        elif (seqheader[jj] == "|" or seqheader[jj] == ":" or
                              jj == len(seqheader)-1):
                            if jj == len(seqheader)-1:
                                newchid += seqheader[jj]
                            else:
                                headerinfo['comments'] = seqheader[jj+1:]
                            if headerinfo['chains'] is None:
                                headerinfo['chains'] = set()
                            headerinfo['chains'].add(newchid)
                            return headerinfo
                        else:
                            newchid += seqheader[jj]
                    # The header ended inside the chain list on a skipped
                    # character (e.g. ']' of '[auth X]'): keep the pending
                    # chain instead of dropping it, and stop so the chain
                    # list text is not re-read as part of a name.
                    if newchid != '':
                        if headerinfo['chains'] is None:
                            headerinfo['chains'] = set()
                        headerinfo['chains'].add(newchid)
                    return headerinfo
                else:
                    if namechar is True:
                        headerinfo['comments'] = seqheader[j+1:]
                        if headerinfo['chains'] is None:
                            headerinfo['chains'] = set()
                        headerinfo['chains'].add(newchid)
                    break
            else:
                if namechar is True:
                    newchid += seqheader[j]
                elif idchar is True:
                    headerinfo['mainid'] += seqheader[j].upper()

    return headerinfo


def _header_field_to_str(value, argname):
    """Return `value` as a str, logging and raising ValueError if it cannot be converted."""
    if isinstance(value, str):
        return value
    try:
        return str(value)
    except Exception as exc:
        msg = "Argument '" + argname + "' is not a string."
        logging.critical(msg)
        raise ValueError(msg) from exc


def makeheader(mainid=None, seqid=None, chains=None,
               source=None, extrainfo=None, short=False):
    """Return a fasta header of the format ">crops|MainID_seqID|Chains chain list|Source: source|extrainfo".

    :param mainid: PDB ID, Uniprot ID, etc.
    :type mainid: str
    :param seqid: Sequence Identifier, usually a natural number: "1", "2", etc, defaults to None.
    :type seqid: str, optional
    :param chains: A set containing the chain IDs of monomers sharing the same sequence, defaults to None.
    :type chains: set [str], optional
    :param source: The source of the sequence ('RCSB PDB', 'UniProtKB/SwissProt', 'PDBe', etc), defaults to None ('Unknown' is written).
    :type source: str, optional
    :param extrainfo: Additional information to be included in the header, defaults to None.
    :type extrainfo: str, optional
    :param short: If True, a short version of the header ('>MainID_seqID|Chains chain list') is returned, defaults to False.
    :type short: bool, optional

    :raises ValueError: If any of `mainid`, `seqid`, `source` or `extrainfo` cannot be converted to a string, if `chains` is not a set, or if any element of `chains` is not a string.

    :return: A fasta header.
    :rtype: str

    """
    # Evaluate the flag once so the prefix and the optional sections always
    # agree, including for truthy values that are not the literal True.
    short = bool(short)

    mainid = _header_field_to_str(mainid, 'mainid')
    if short:
        # Only the short form upper-cases the main ID.
        newheader = '>' + mainid.upper()
    else:
        newheader = '>crops|' + mainid
    if seqid is not None:
        newheader += '_' + _header_field_to_str(seqid, 'seqid')
    newheader += '|'

    if chains is not None:
        if (not isinstance(chains, set) or
                not all(isinstance(element, str) for element in chains)):
            msg = "Argument 'chains' is not a set of strings."
            logging.critical(msg)
            raise ValueError(msg)
        if chains:
            newheader += 'Chains ' if len(chains) > 1 else 'Chain '
            # Sorted so the header is deterministic regardless of set order.
            newheader += ','.join(sorted(chains))
        if not short:
            # The chain field is closed even when the set is empty.
            newheader += '|'

    if not short:
        if source is None:
            source = 'Unknown'
        newheader += 'Source: ' + _header_field_to_str(source, 'source') + '|'
        if extrainfo is not None:
            newheader += _header_field_to_str(extrainfo, 'extrainfo')

    return newheader