Source code for crops.core.ops

"""Core operations (renumber, crop, split) are defined here."""

from crops import __prog__, __description__, __author__
from crops import __date__, __version__, __copyright__

from crops.libs import ressymbol
from crops.elements.sequences import sequence

from Bio import Align
from Bio.Align import substitution_matrices

import copy
import gemmi
import logging


[docs]def get_sequence_alignment(sequence_1, sequence_2, mode='global', open_gap_score=-11, extend_gap_score=-2): """Perform a sequence alignment using Needleman-Wunsch algorithm. :param sequence_1: First input sequence. :type sequence_1: str :param sequence_2: Second input sequence. :type sequence_2: str :param mode: Alignment mode, defaults to 'global'. :type mode: str, optional :param open_gap_score: Opening gap penalty, defaults to -11. :type open_gap_score: int, optional :param extend_gap_score: Extension gap penalty, defaults to -2. :type extend_gap_score: int, optional :return alignment_dict: Dictionary with the residue mapping between both input sequences. :rtype alignment_dict: dict [int, int] """ aligner = Align.PairwiseAligner() aligner.mode = mode aligner.substitution_matrix = substitution_matrices.load("BLOSUM62") aligner.open_gap_score = open_gap_score aligner.extend_gap_score = extend_gap_score try: alignments = list(aligner.align(sequence_1, sequence_2)) except ValueError as e: logging.warning('Needleman-Wunsch alignment failed due to wrong alphabet:\n{}'.format(e)) return None alignments.sort(key=lambda x: x.score, reverse=True) aligned_indices = alignments[0].aligned alignment_dict = {} for query_chunk, target_chunk in zip(*aligned_indices): for query_index, target_index in zip(range(*query_chunk), range(*target_chunk)): alignment_dict[target_index] = query_index return alignment_dict
[docs]def renumber_pdb_needleman(inseq, instr, seqback=False, remove_ligands=False): """Return modified :class:`gemmi.Structure` with new residue numbers. It uses Needleman-Wunsch algorithm to perform the sequence alignment. :param inseq: Input sequence. :type inseq: :class:`crops.elements.sequences.oligoseq` :param instr: Gemmi structure. :type instr: :class:`gemmi.Structure` :param seqback: If True, it additionally returns the :class:`crops.elements.sequences.oligoseq` with the gaps found in the structure, defaults to False. :type seqback: bool, optional :param remove_ligands: If True, use :func:`gemmi.Structure.remove_ligands_and_waters` and :func:`gemmi.Structure.remove_empty_chains` prior to operation, defaults to False. :type remove_ligands: bool, optional :return instr: Renumbered structure. :rtype instr: :class:`gemmi.Structure` :return inseq: Sequence with extra information about gaps, only if seqback is True. :rtype inseq: :class:`crops.elements.sequences.oligoseq` """ renumbered_structure = gemmi.Structure() renumbered_structure.name = instr.name if seqback: for monkey in inseq.imer: inseq.imer[monkey].seqs['gapseq'] = [] if remove_ligands: instr.remove_ligands_and_waters() instr.remove_empty_chains for model in instr: renumbered_model = gemmi.Model(model.name) for chain in model: renumbered_chain = gemmi.Chain(chain.name) nseq = inseq.whatseq(chain.name) original_seq = inseq.imer[nseq].seqs['mainseq'] model_seq = '' for x in chain: rs = ressymbol(x.name) if isinstance(rs, str): model_seq += rs aligned_dict = get_sequence_alignment(original_seq, model_seq) if aligned_dict is None: logging.warning('Alignment failed for chain {}, it will be excluded'.format(chain.name)) continue for index, residue in enumerate(chain): _residue = residue.clone() _residue.seqid.num = aligned_dict[index] + 1 renumbered_chain.add_residue(_residue) renumbered_model.add_chain(renumbered_chain) if seqback: res_set = {residue.seqid.num for residue in renumbered_chain} newseq = '' for n in range(len(original_seq)): if (n+1) in res_set: newseq += original_seq[n] else: newseq += '-' inseq.imer[nseq].seqs['gapseq'].append(newseq) renumbered_structure.add_model(renumbered_model) if seqback: return renumbered_structure, inseq else: return renumbered_structure
[docs]def renumber_pdb(inseq, instr, seqback=False, remove_ligands=False): """Return modified :class:`gemmi.Structure` with new residue numbers. :param inseq: Input sequence. :type inseq: :class:`crops.elements.sequences.oligoseq` :param instr: Gemmi structure. :type instr: :class:`gemmi.Structure` :param seqback: If True, it additionally returns the :class:`crops.elements.sequences.oligoseq` with the gaps found in the structure, defaults to False. :type seqback: bool, optional :param remove_ligands: If True, use :func:`gemmi.Structure.remove_ligands_and_waters` and :func:`gemmi.Structure.remove_empty_chains` prior to operation, defaults to False. :type remove_ligands: bool, optional :return instr: Renumbered structure. :rtype instr: :class:`gemmi.Structure` :return inseq: Sequence with extra information about gaps, only if seqback is True. :rtype inseq: :class:`crops.elements.sequences.oligoseq` """ n_chains = 0 n_resmax = 0 if remove_ligands: instr.remove_ligands_and_waters() instr.remove_empty_chains() for model in instr: n_chains += len(model) for chain in model: if len(chain) > n_resmax: n_resmax = len(chain) pos = [[0 for j in range(n_resmax)] for i in range(n_chains)] n_chains = 0 if seqback: for monkey in inseq.imer: inseq.imer[monkey].seqs['gapseq'] = [] for model in instr: for chain in model: solved = False nseq = inseq.whatseq(chain.name) if nseq in inseq.imer: original_seq = inseq.imer[nseq].seqs['mainseq'] for shift in range(int(len(chain)/2)): cnt = 0 gap = 0 score = 0 nligands = 0 newseq = '-'*shift for residue in chain: if str(residue.entity_type) != 'EntityType.Polymer': nligands += 1 pos[n_chains][cnt] = -nligands else: if residue != chain[0] and chain[cnt].seqid.num-chain[cnt-1].seqid.num > 1: gap += (chain[cnt].seqid.num-chain[cnt-1].seqid.num-1) newseq += '-'*(chain[cnt].seqid.num-chain[cnt-1].seqid.num-1) pos[n_chains][cnt] = cnt+1+gap+shift if (ressymbol(residue.name, pick=original_seq[cnt+gap+shift]) == original_seq[cnt+gap+shift]): score += 1 newseq += original_seq[cnt+gap+shift] else: # IF ressymbol == 0 as a result of not being in the library, it will be considered a ligand. If wrong, it might make the script crash. pdbseq = instr.get_entity(chain.name).full_sequence if cnt > len(pdbseq): nligands += 1 pos[n_chains][cnt] = -nligands else: res_pdbid = pdbseq[cnt].split(",") if len(res_pdbid) > 1: for n in range(1, len(res_pdbid)): if ressymbol(res_pdbid[n], pick=original_seq[cnt+gap+shift]) == original_seq[cnt+gap+shift]: score += 1 newseq += original_seq[cnt+gap+shift] break cnt += 1 if score == len(chain)-nligands: solved = True if len(newseq) < len(original_seq): newseq += '-'*(len(original_seq)-len(newseq)) break if solved is False: nerr = ("The .fasta sequence and the original " "structure's numbering do not match. The number " "of gaps in the structure must be consistent " "with the number of residues in the sequence " "in order for CROPS to make the renumbering.") logging.critical(nerr) raise ValueError else: nwarn = ('.pdb chain '+str(chain.name) + ' not found in ' '.fasta file. All elements considered ligands.') logging.warning(nwarn) ligandwarn = False nligands = 0 for residue in chain: if ressymbol(residue.name) != 0: ligandwarn = True nligands += 1 pos[n_chains][nligands-1] = -nligands if ligandwarn is True: nwarn = ('Some of the ligands contain Aminoacid or ' 'Nucleotide residues. Please check whether ' 'they are actually ligands.') logging.warning(nwarn) solved = True if solved: cnt = 0 for residue in chain: if pos[n_chains][cnt] < 0: residue.seqid.num = len(newseq)-pos[n_chains][cnt] else: residue.seqid.num = pos[n_chains][cnt] cnt += 1 if seqback: if nseq in inseq.imer: inseq.imer[nseq].seqs['gapseq'].append(newseq) n_chains += 1 solved = False if seqback: return instr, inseq else: return instr
[docs]def crop_seq(inseq, segments, cut_type, terms=False): """Return modified :class:`crops.elements.sequences.sequence` without specified elements. :param inseq: Input sequence. :type inseq: :class:`crops.elements.sequences.sequence` :param segments: Input preservation interval. :type segments: :class:`crops.elements.intervals.intinterval` :param cut_type: Additional header information. :type cut_type: str :param terms: If True, only terminal ends are removed, defaults to False. :type terms: bool, optional :raises `ValueError`: If intervals given lie out of the sequence. :return: Cropped sequence. :rtype: :class:`crops.elements.sequences.sequence` """ if len(segments.subint) > 0: if segments.subint[-1][-1] > inseq.length(): logging.debug('On '+str(inseq)) logging.debug('with '+str(segments)) logging.critical('One or many of the segment end values is ' 'outside the original sequence.') raise ValueError newchain = inseq.deepcopy() newchain.seqs['mainseq'] = '' newchain.seqs['fullseq'] = inseq.seqs['mainseq'] newchain.seqs['cropseq'] = '' if 'gapseq' in inseq.seqs: newchain.seqs['gapseq'] = ['']*len(inseq.seqs['gapseq']) newchain.seqs['cropgapseq'] = ['']*len(inseq.seqs['gapseq']) cropint = segments.deepcopy() if not terms else segments.union(segments.terminals()) cropmap = {} cropmap['map'] = {} cropmap['backmap'] = {} for res in range(inseq.length()): cropmap['map'][res+1] = None for res in range(inseq.length()): if cropint.contains(res+1): newchain.seqs['mainseq'] += inseq.seqs['mainseq'][res] cropmap['map'][res+1] = len(newchain.seqs['mainseq']) cropmap['backmap'][len(newchain.seqs['mainseq'])] = res+1 if 'gapseq' in inseq.seqs: for n in range(len(inseq.seqs['gapseq'])): newchain.seqs['gapseq'][n] += inseq.seqs['gapseq'][n][res] newchain.seqs['cropgapseq'][n] += inseq.seqs['gapseq'][n][res] newchain.seqs['cropseq'] += inseq.seqs['mainseq'][res] else: if 'gapseq' in inseq.seqs: for n in range(len(inseq.seqs['gapseq'])): newchain.seqs['cropgapseq'][n] += '+' newchain.seqs['cropseq'] += '+' newchain.infostring += cut_type newchain.cropmap = copy.deepcopy(cropmap['map']) newchain.cropbackmap = copy.deepcopy(cropmap['backmap']) return newchain
[docs]def crop_pdb(instr, inseq, original_id=True): """Return modified :class:`gemmi.Structure` without specified elements. :param instr: Gemmi structure. :type instr: :class:`gemmi.Structure` :param inseq: Input previously-cropped-sequence. :type inseq: :class:`crops.elements.sequences.oligoseq` :param original_id: If True, it will keep residue ids alligned to original sequence, defaults to True. :type original_id: bool, optional :return: Cropped structure. :rtype: :class:`gemmi.Structure` """ for model in instr: for chain in model: if chain.name in inseq.chainlist(): chseq = inseq.whatseq(chain.name) if inseq.imer[chseq].cropmap is not None: for res in reversed(range(len(chain))): if chain[res].seqid.num in inseq.imer[chseq].cropmap: if inseq.imer[chseq].cropmap[chain[res].seqid.num] is not None: if not original_id: chain[res].seqid.num = inseq.imer[chseq].cropmap[chain[res].seqid.num] else: del chain[res] return instr