Source code for bioutils.sequences

# -*- coding: utf-8 -*-
"""Simple functions and lookup tables for nucleic acid and amino acid sequences.
"""

import logging
import re
from string import ascii_lowercase


_logger = logging.getLogger(__name__)

aa3_to_aa1_lut = {
    "Ala": "A",
    "Arg": "R",
    "Asn": "N",
    "Asp": "D",
    "Cys": "C",
    "Gln": "Q",
    "Glu": "E",
    "Gly": "G",
    "His": "H",
    "Ile": "I",
    "Leu": "L",
    "Lys": "K",
    "Met": "M",
    "Phe": "F",
    "Pro": "P",
    "Ser": "S",
    "Thr": "T",
    "Trp": "W",
    "Tyr": "Y",
    "Val": "V",
    "Xaa": "X",
    "Ter": "*",
    "Sec": "",
}

aa1_to_aa3_lut = {v: k for k, v in aa3_to_aa1_lut.items()}

dna_to_aa1_lut = {      # NCBI standard translation table
    'AAA': 'K',
    'AAC': 'N',
    'AAG': 'K',
    'AAT': 'N',
    'ACA': 'T',
    'ACC': 'T',
    'ACG': 'T',
    'ACT': 'T',
    'AGA': 'R',
    'AGC': 'S',
    'AGG': 'R',
    'AGT': 'S',
    'ATA': 'I',
    'ATC': 'I',
    'ATG': 'M',
    'ATT': 'I',
    'CAA': 'Q',
    'CAC': 'H',
    'CAG': 'Q',
    'CAT': 'H',
    'CCA': 'P',
    'CCC': 'P',
    'CCG': 'P',
    'CCT': 'P',
    'CGA': 'R',
    'CGC': 'R',
    'CGG': 'R',
    'CGT': 'R',
    'CTA': 'L',
    'CTC': 'L',
    'CTG': 'L',
    'CTT': 'L',
    'GAA': 'E',
    'GAC': 'D',
    'GAG': 'E',
    'GAT': 'D',
    'GCA': 'A',
    'GCC': 'A',
    'GCG': 'A',
    'GCT': 'A',
    'GGA': 'G',
    'GGC': 'G',
    'GGG': 'G',
    'GGT': 'G',
    'GTA': 'V',
    'GTC': 'V',
    'GTG': 'V',
    'GTT': 'V',
    'TAA': '*',
    'TAC': 'Y',
    'TAG': '*',
    'TAT': 'Y',
    'TCA': 'S',
    'TCC': 'S',
    'TCG': 'S',
    'TCT': 'S',
    'TGA': '*',
    'TGC': 'C',
    'TGG': 'W',
    'TGT': 'C',
    'TTA': 'L',
    'TTC': 'F',
    'TTG': 'L',
    'TTT': 'F',
    # degenerate codons
    'AAR': 'K',
    'AAY': 'N',
    'ACB': 'T',
    'ACD': 'T',
    'ACH': 'T',
    'ACK': 'T',
    'ACM': 'T',
    'ACN': 'T',
    'ACR': 'T',
    'ACS': 'T',
    'ACV': 'T',
    'ACW': 'T',
    'ACY': 'T',
    'AGR': 'R',
    'AGY': 'S',
    'ATH': 'I',
    'ATM': 'I',
    'ATW': 'I',
    'ATY': 'I',
    'CAR': 'Q',
    'CAY': 'H',
    'CCB': 'P',
    'CCD': 'P',
    'CCH': 'P',
    'CCK': 'P',
    'CCM': 'P',
    'CCN': 'P',
    'CCR': 'P',
    'CCS': 'P',
    'CCV': 'P',
    'CCW': 'P',
    'CCY': 'P',
    'CGB': 'R',
    'CGD': 'R',
    'CGH': 'R',
    'CGK': 'R',
    'CGM': 'R',
    'CGN': 'R',
    'CGR': 'R',
    'CGS': 'R',
    'CGV': 'R',
    'CGW': 'R',
    'CGY': 'R',
    'CTB': 'L',
    'CTD': 'L',
    'CTH': 'L',
    'CTK': 'L',
    'CTM': 'L',
    'CTN': 'L',
    'CTR': 'L',
    'CTS': 'L',
    'CTV': 'L',
    'CTW': 'L',
    'CTY': 'L',
    'GAR': 'E',
    'GAY': 'D',
    'GCB': 'A',
    'GCD': 'A',
    'GCH': 'A',
    'GCK': 'A',
    'GCM': 'A',
    'GCN': 'A',
    'GCR': 'A',
    'GCS': 'A',
    'GCV': 'A',
    'GCW': 'A',
    'GCY': 'A',
    'GGB': 'G',
    'GGD': 'G',
    'GGH': 'G',
    'GGK': 'G',
    'GGM': 'G',
    'GGN': 'G',
    'GGR': 'G',
    'GGS': 'G',
    'GGV': 'G',
    'GGW': 'G',
    'GGY': 'G',
    'GTB': 'V',
    'GTD': 'V',
    'GTH': 'V',
    'GTK': 'V',
    'GTM': 'V',
    'GTN': 'V',
    'GTR': 'V',
    'GTS': 'V',
    'GTV': 'V',
    'GTW': 'V',
    'GTY': 'V',
    'MGA': 'R',
    'MGG': 'R',
    'MGR': 'R',
    'TAR': '*',
    'TAY': 'Y',
    'TCB': 'S',
    'TCD': 'S',
    'TCH': 'S',
    'TCK': 'S',
    'TCM': 'S',
    'TCN': 'S',
    'TCR': 'S',
    'TCS': 'S',
    'TCV': 'S',
    'TCW': 'S',
    'TCY': 'S',
    'TGY': 'C',
    'TRA': '*',
    'TTR': 'L',
    'TTY': 'F',
    'YTA': 'L',
    'YTG': 'L',
    'YTR': 'L',
}


complement_transtable = bytes.maketrans(b"ACGT", b"TGCA")


[docs]def aa_to_aa1(seq):
    """Coerces string of 1- or 3-letter amino acids to 1-letter representation.

    Args:
        seq (str): An amino acid sequence.
        
    Returns:
        str: The sequence as one of 1-letter amino acids.    

    Examples:
        >>> aa_to_aa1("CATSARELAME")
        'CATSARELAME'

        >>> aa_to_aa1("CysAlaThrSerAlaArgGluLeuAlaMetGlu")
        'CATSARELAME'

        >>> aa_to_aa1(None)
    """

    if seq is None:
        return None
    return aa3_to_aa1(seq) if looks_like_aa3_p(seq) else seq


[docs]def aa_to_aa3(seq):
    """Coerces string of 1- or 3-letter amino acids to 3-letter representation.

    Args:
        seq (str): An amino acid sequence.
        
    Returns:   
        str: The sequence as one of 3-letter amino acids.    

    Examples:
        >>> aa_to_aa3("CATSARELAME")
        'CysAlaThrSerAlaArgGluLeuAlaMetGlu'

        >>> aa_to_aa3("CysAlaThrSerAlaArgGluLeuAlaMetGlu")
        'CysAlaThrSerAlaArgGluLeuAlaMetGlu'

        >>> aa_to_aa3(None)
    """

    if seq is None:
        return None
    return aa1_to_aa3(seq) if not looks_like_aa3_p(seq) else seq


[docs]def aa1_to_aa3(seq):
    """Converts string of 1-letter amino acids to 3-letter amino acids.

    Should only be used if the format of the sequence is known; otherwise use ``aa_to_aa3()``.

    Args:
        seq (str): An amino acid sequence as 1-letter amino acids.
        
    Returns:  
        str: The sequence as 3-letter amino acids.  
    
    Raises:
        KeyError: If the sequence is not of 1-letter amino acids.
        
    Examples:
        >>> aa1_to_aa3("CATSARELAME")
        'CysAlaThrSerAlaArgGluLeuAlaMetGlu'

        >>> aa1_to_aa3(None)
    """

    if seq is None:
        return None
    return "".join(aa1_to_aa3_lut[aa1] for aa1 in seq)


[docs]def aa3_to_aa1(seq):
    """Converts string of 3-letter amino acids to 1-letter amino acids.

    Should only be used if the format of the sequence is known; otherwise use ``aa_to_aa1()``.

    Args:
        seq (str): An amino acid sequence as 3-letter amino acids.
        
    Returns:   
        str: The sequence as 1-letter amino acids.   
    
    Raises:
        KeyError: If the sequence is not of 3-letter amino acids.
    
    Examples:
        >>> aa3_to_aa1("CysAlaThrSerAlaArgGluLeuAlaMetGlu")
        'CATSARELAME'

        >>> aa3_to_aa1(None)
    """

    if seq is None:
        return None
    return "".join(aa3_to_aa1_lut[aa3]
                   for aa3 in [seq[i:i + 3] for i in range(0, len(seq), 3)])


[docs]def complement(seq):
    """Retrieves the complement of a sequence.

    Args:
        seq (str): A nucleotide sequence.
        
    Returns:    
        str: The complement of the sequence.
        
    Examples:
        >>> complement("ATCG")
        'TAGC'

        >>> complement(None)
    """

    if seq is None:
        return None
    return seq.translate(complement_transtable)


[docs]def elide_sequence(s, flank=5, elision="..."):
    """Trims the middle of the sequence, leaving the right and left flanks.

    Args:
        s (str): A sequence.
        flank (int, optional): The length of each flank. Defaults to five.
        elision (str, optional): The symbol used to represent the part trimmed. Defaults to '...'.
    
        Returns:    
            str: The sequence with the middle replaced by ``elision``.
        
    Examples:
        >>> elide_sequence("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
        'ABCDE...VWXYZ'

        >>> elide_sequence("ABCDEFGHIJKLMNOPQRSTUVWXYZ", flank=3)
        'ABC...XYZ'

        >>> elide_sequence("ABCDEFGHIJKLMNOPQRSTUVWXYZ", elision="..")
        'ABCDE..VWXYZ'

        >>> elide_sequence("ABCDEFGHIJKLMNOPQRSTUVWXYZ", flank=12)
        'ABCDEFGHIJKLMNOPQRSTUVWXYZ'

        >>> elide_sequence("ABCDEFGHIJKLMNOPQRSTUVWXYZ", flank=12, elision=".")
        'ABCDEFGHIJKL.OPQRSTUVWXYZ'
    """

    elided_sequence_len = flank + flank + len(elision)
    if len(s) <= elided_sequence_len:
        return s
    return s[:flank] + elision + s[-flank:]


[docs]def looks_like_aa3_p(seq):
    """Indicates whether a string looks like a 3-letter AA string.

    Args:
        seq (str): A sequence.
        
    Returns:
        bool: Whether the string is of the format of a 3-letter AA string.
    """
    return (seq is not None and (len(seq) % 3 == 0) and
            (len(seq) == 0 or seq[1] in ascii_lowercase))


[docs]def normalize_sequence(seq):
    """Converts sequence to normalized representation for hashing.

    Essentially, removes whitespace and asterisks, and uppercases the string.

    Args:
        seq (str): The sequence to be normalized.
        
    Returns:
        str: The sequence as a string of uppercase letters.    
    
    Raises:
        RuntimeError: If the sequence contains non-alphabetic characters (besides '*').
        
    Examples:
        >>> normalize_sequence("ACGT")
        'ACGT'

        >>> normalize_sequence("  A C G T * ")
        'ACGT'

        >>> normalize_sequence("ACGT1")
        Traceback (most recent call last):
        ...
        RuntimeError: Normalized sequence contains non-alphabetic characters
    """

    nseq = re.sub(r"[\s\*]", "", seq).upper()
    m = re.search("[^A-Z]", nseq)
    if m:
        _logger.debug("Original sequence: " + seq)
        _logger.debug("Normalized sequence: " + nseq)
        _logger.debug("First non-[A-Z] at {}".format(m.start()))
        raise RuntimeError("Normalized sequence contains non-alphabetic characters")
    return nseq


[docs]def reverse_complement(seq):
    """Converts a sequence to its reverse complement.

    Args:
        seq (str): A nucleotide sequence.
        
    Returns:    
        str: The reverse complement of the sequence.
    
    Examples:
        >>> reverse_complement("ATCG")
        'CGAT'

        >>> reverse_complement(None)  
    """

    if seq is None:
        return None
    return "".join(reversed(complement(seq)))


[docs]def replace_t_to_u(seq):
    """Replaces the T's in a sequence with U's.

    Args:
        seq (str): A nucleotide sequence.
        
    Returns:    
        str: The sequence with the T's replaced by U's.
        
    Examples:
        >>> replace_t_to_u("ACGT")
        'ACGU'

        >>> replace_t_to_u(None)
    """

    if seq is None:
        return None
    return seq.replace("T", "U").replace("t", "u")


[docs]def replace_u_to_t(seq):
    """Replaces the U's in a sequence with T's.

    Args:
        seq (str): A nucleotide sequence.
        
    Returns: 
        str: The sequence with the U's replaced by T's.   
    
    Examples:
        >>> replace_u_to_t("ACGU")
        'ACGT'

        >>> replace_u_to_t(None)
    """

    if seq is None:
        return None
    return seq.replace("U", "T").replace("u", "t")


[docs]def translate_cds(seq, full_codons=True, ter_symbol="*"):
    """Translates a DNA or RNA sequence into a single-letter amino acid sequence.
    
    Uses the NCBI standard translation table.

    Args:
        seq (str): A nucleotide sequence.
        full_codons (bool, optional): If ``True``, forces sequence to have length 
            that is a multiple of 3 and raises an error otherwise. 
            If False, ``ter_symbol`` will be added as the last amino acid.
            This corresponds to biopython's behavior of padding the last codon with ``N``s.
            Defaults to ``True``.
        ter_symbol (str, optional): Placeholder for the last amino acid if 
            sequence length is not divisible by three and ``full_codons`` is False. 
            Defaults to ``'*'``

    Returns:
        str: The corresponding single letter amino acid sequence.    
    
    Raises:
        ValueError: If ``full_codons`` and the sequence is not a multiple of three.
        ValueError: If a codon is undefined in the table.
        
    Examples:
        >>> translate_cds("ATGCGA")
        'MR'

        >>> translate_cds("AUGCGA")
        'MR'

        >>> translate_cds(None)

        >>> translate_cds("")
        ''

        >>> translate_cds("AUGCG")
        Traceback (most recent call last):
        ...
        ValueError: Sequence length must be a multiple of three

        >>> translate_cds("AUGCG", full_codons=False)
        'M*'

        >>> translate_cds("ATGTAN")
        'MX'

        >>> translate_cds("CCN")
        'P'

        >>> translate_cds("TRA")
        '*'

        >>> translate_cds("TTNTA", full_codons=False)
        'X*'

        >>> translate_cds("CTB")
        'L'

        >>> translate_cds("AGM")
        'X'

        >>> translate_cds("GAS")
        'X'

        >>> translate_cds("CUN")
        'L'

        >>> translate_cds("AUGCGQ")
        Traceback (most recent call last):
        ...
        ValueError: Codon CGQ at position 4..6 is undefined in codon table
    """

    if seq is None:
        return None

    if len(seq) == 0:
        return ""

    if full_codons and len(seq) % 3 != 0:
        raise ValueError("Sequence length must be a multiple of three")

    seq = replace_u_to_t(seq)
    seq = seq.upper()

    protein_seq = list()
    for i in range(0, len(seq) - len(seq) % 3, 3):
        codon = seq[i:i + 3]
        try:
            aa = dna_to_aa1_lut[codon]
        except KeyError:
            # if this contains an ambiguous code, set aa to X, otherwise, throw error
            iupac_ambiguity_codes = "BDHVNUWSMKRYZ"
            if any([iupac_ambiguity_code in codon for iupac_ambiguity_code in iupac_ambiguity_codes]):
                aa = "X"
            else:
                raise ValueError("Codon {} at position {}..{} is undefined in codon table".format(
                    codon, i + 1, i + 3))
        protein_seq.append(aa)

    # check for trailing bases and add the ter symbol if required
    if not full_codons and len(seq) % 3 != 0:
        protein_seq.append(ter_symbol)

    return ''.join(protein_seq)


## <LICENSE>
## Copyright 2014 Bioutils Contributors
## 
## Licensed under the Apache License, Version 2.0 (the "License");
## you may not use this file except in compliance with the License.
## You may obtain a copy of the License at
## 
##     http://www.apache.org/licenses/LICENSE-2.0
## 
## Unless required by applicable law or agreed to in writing, software
## distributed under the License is distributed on an "AS IS" BASIS,
## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
## See the License for the specific language governing permissions and
## limitations under the License.
## </LICENSE>
Source code for bioutils.sequences

bioutils

Navigation

Related Topics