Source code for bioutils.seqfetcher

# -*- coding: utf-8 -*-
"""Provides sequence fetching from NCBI and Ensembl.

"""

from __future__ import absolute_import, division, print_function, unicode_literals

import logging
import os
import random
import re
import time

import requests

_logger = logging.getLogger(__name__)

# Reece requested registration on 2017-09-03
ncbi_tool = "bioutils"
ncbi_email = "biocommons-dev@googlegroups.com"
retry_limit = 3


[docs]def fetch_seq(ac, start_i=None, end_i=None):
    """Fetches sequences and subsequences from NCBI eutils and Ensembl REST interfaces.

    Args:
        ac (str): The accession of the sequence to fetch.
        start_i (int, optional): The start index (interbase coordinates) of the subsequence to fetch. Defaults to ``None``.
            It is recommended to retrieve a subsequence by providing an index here, rather than by
            Python slicing the whole sequence.
        end_i (int, optional): The end index (interbase coordinates) of the subsequence to fetch. Defaults to ``None``.
            It is recommended to retrieve a subsequence by providing an index here, rather than by
            Python slicing the whole sequence.

    Returns:
        str: The requested sequence.

    Raises:
        RuntimeError: If the syntax doesn't match that of any of the databases.
        RuntimeError: If the request to the database fails.

    Examples:
        >>> len(fetch_seq('NP_056374.2'))
        1596

        >>> fetch_seq('NP_056374.2',0,10)   # This!
        'MESRETLSSS'

        >>> fetch_seq('NP_056374.2')[0:10]  # Not this!
        'MESRETLSSS'

        # Providing intervals is especially important for large sequences:

        >>> fetch_seq('NC_000001.10',2000000,2000030)
        'ATCACACGTGCAGGAACCCTTTTCCAAAGG'

        # This call will pull back 30 bases plus overhead; without the
        # interval, one would receive 250MB of chr1 plus overhead!

        # Essentially any RefSeq, Genbank, BIC, or Ensembl sequence may be
        # fetched.

        >>> fetch_seq('NM_9.9')
        Traceback (most recent call last):
        ...
        RuntimeError: No sequence available for NM_9.9

        >>> fetch_seq('QQ01234')
        Traceback (most recent call last):
        ...
        RuntimeError: No sequence fetcher for QQ01234
    """

    ac_dispatch = [
        {
            "re": re.compile(r"^(?:AC|N[CGMPRTW])_|^[A-L]\w\d|^U\d"),
            "fetcher": _fetch_seq_ncbi,
        },
        {"re": re.compile(r"^ENS[TP]\d+"), "fetcher": _fetch_seq_ensembl},
    ]

    eligible_fetchers = [dr["fetcher"] for dr in ac_dispatch if dr["re"].match(ac)]

    if len(eligible_fetchers) == 0:
        raise RuntimeError("No sequence fetcher for {ac}".format(ac=ac))

    if len(eligible_fetchers) >= 1:  # pragma: nocover (no way to test)
        _logger.debug(
            "Multiple sequence fetchers found for " "{ac}; using first".format(ac=ac)
        )

    fetcher = eligible_fetchers[0]
    _logger.debug("fetching {ac} with {f}".format(ac=ac, f=fetcher))

    try:
        return fetcher(ac, start_i, end_i)
    except requests.RequestException as ex:
        raise RuntimeError("Failed to fetch {ac} ({ex})".format(ac=ac, ex=ex))


# ###########################################################################
# Internal functions


def _fetch_seq_ensembl(ac, start_i=None, end_i=None):
    """Fetch sequence slice from Ensembl public REST interface.

    Args:
        ac (str): The accession of the sequence to fetch.
        start_i (int, optional): The start index (interbase coordinates) of the subsequence to fetch.
            Defaults to None.
        end_i (int, optional): The end index (interbase coordinates) of the subsequence to fetch.
            Defaults to None.

    Returns:
        str: The requested (sub)sequence

    Raises:
        RequestException: if request is unsuccessful.
        KeyError: if Ensembl API returns a different version than requested

    Note:
        The Ensembl REST interface does not currently accept intervals, so this method
        slices the sequence locally.

    Examples:
        >> len(_fetch_seq_ensembl('ENSP00000288602'))
        766

        >> _fetch_seq_ensembl('ENSP00000288602',0,10)
        u'MAALSGGGGG'

        >> _fetch_seq_ensembl('ENSP00000288602')[0:10]
        u'MAALSGGGGG'

        >> ac = 'ENSP00000288602'
        >> _fetch_seq_ensembl(ac ,0, 10) == _fetch_seq_ensembl(ac)[0:10]
        True
    """

    # Ensembl API only takes transcript IDs (without version) and returns the latest one
    # So we need to strip the transcript version, then check if what was returned was the one we requested
    version = None
    m = re.match(r"^(ENST\d+)\.(\d+)$", ac)
    if m:
        ac, version = m.groups()
        version = int(version)

    url = f"http://rest.ensembl.org/sequence/id/{ac}"
    r = requests.get(url, headers={"Content-Type": "application/json"})
    r.raise_for_status()
    data = r.json()
    if version is not None:
        latest_version = data["version"]
        if version != latest_version:
            msg = f"Ensembl API only provides {ac} version ({latest_version}), requested: {version}"
            raise KeyError(msg)

    seq = data["seq"]
    return seq if (start_i is None or end_i is None) else seq[start_i:end_i]


def _fetch_seq_ncbi(ac, start_i=None, end_i=None):
    """Fetches sequences from NCBI using the eutils interface.

    Args:
        ac (str): The accession of the sequence to fetch.
        start_i (int, optional): The start index (interbase coordinates) of the subsequence to fetch.
            Defaults to None.
        end_i (int, optional): The end index (interbase coordinates) of the subsequence to fetch.
            Defaults to None.

    Returns:
        str: The requested (sub)sequence

    Raises:
        RequestException: if request is unsuccessful.

    Notes:
        An interbase interval may be optionally provided with start_i and
        end_i. NCBI eutils will return just the requested subsequence,
        which might greatly reduce payload sizes (especially with
        chromosome-scale sequences).

        The request includes `tool` and `email` arguments to identify the
        caller as the bioutils package.  According to
        https://www.ncbi.nlm.nih.gov/books/NBK25497/, these values should
        correspond to the library, not the library client.  Using the
        defaults is recommended.  Nonetheless, callers may set
        ``bioutils.seqfetcher.ncbi_tool`` and
        ``bioutils.seqfetcher.ncbi_email`` to custom values if that is
        desired.

    Examples:
        >> len(_fetch_seq_ncbi('NP_056374.2'))
        1596

        Pass the desired interval rather than using Python's [] slice
        operator.

        >> _fetch_seq_ncbi('NP_056374.2',0,10)
        'MESRETLSSS'

        >> _fetch_seq_ncbi('NP_056374.2')[0:10]
        'MESRETLSSS'

        >> _fetch_seq_ncbi('NP_056374.2',0,10) == _fetch_seq_ncbi('NP_056374.2')[0:10]
        True
    """

    db = "protein" if ac[1] == "P" else "nucleotide"
    url_fmt = (
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?"
        "db={db}&id={ac}&rettype=fasta"
    )

    if start_i is None or end_i is None:
        url = url_fmt.format(db=db, ac=ac)
    else:
        url_fmt += "&seq_start={start}&seq_stop={stop}"
        url = url_fmt.format(db=db, ac=ac, start=start_i + 1, stop=end_i)

    url += "&tool={tool}&email={email}".format(tool=ncbi_tool, email=ncbi_email)

    url = _add_eutils_api_key(url)

    n_retries = 0
    while True:
        resp = requests.get(url)
        if resp.ok:
            seq = "".join(resp.text.splitlines()[1:])
            return seq
        elif resp.status_code == 400:
            # Invalid sequence or start/stop position for that sequence
            raise RuntimeError(
                "Fetching sequence {} with start index {} and end index {} failed, invalid sequence "
                "or start or end position".format(ac, start_i, end_i)
            )
        if n_retries >= retry_limit:
            break
        if n_retries == 0:
            _logger.warning("Failed to fetch {}".format(url))
        sleeptime = random.randint(n_retries, 3) ** n_retries
        _logger.warning(
            "Failure {}/{}; retry in {} seconds".format(
                n_retries, retry_limit, sleeptime
            )
        )
        time.sleep(sleeptime)
        n_retries += 1
    # Falls through only on failure
    resp.raise_for_status()


def _add_eutils_api_key(url):
    """Adds an eutils api key to the query.

    Args:
        url (str): The query url without the api key.

    Returns:
        str: The query url with the api key, if one is stored in the environment variable
            ``NCBI_API_KEY``, otherwise it is unaltered.
    """

    apikey = os.environ.get("NCBI_API_KEY")
    if apikey:
        url += "&api_key={apikey}".format(apikey=apikey)
    return url


# So that I don't forget why I didn't use ebi too:
# $ curl 'http://www.ebi.ac.uk/ena/data/view/AM407889.1&display=fasta'
# >ENA|AM407889|AM407889.2 Medicago sativa partial mRNA ...
# AACGTATCACACTTCTTCTCCATTTCTTTTTCTTACATCTTCTCTCTACAAATTCATTTC
# Note that we requested .1, got .2.  Implicit behavior bites again.

if __name__ == "__main__":  # pragma: nocover
    import doctest

    doctest.testmod()
Source code for bioutils.seqfetcher

bioutils

Navigation

Related Topics