Source code for bioutils.seqfetcher

# -*- coding: utf-8 -*-
"""Provides sequence fetching from NCBI and Ensembl.

"""

from __future__ import absolute_import, division, print_function, unicode_literals

import logging
import os
import random
import re
import time

import requests

_logger = logging.getLogger(__name__)

# Reece requested registration on 2017-09-03
ncbi_tool = "bioutils"
ncbi_email = "biocommons-dev@googlegroups.com"
retry_limit = 3


[docs]def fetch_seq(ac, start_i=None, end_i=None): """Fetches sequences and subsequences from NCBI eutils and Ensembl REST interfaces. Args: ac (str): The accession of the sequence to fetch. start_i (int, optional): The start index (interbase coordinates) of the subsequence to fetch. Defaults to ``None``. It is recommended to retrieve a subsequence by providing an index here, rather than by Python slicing the whole sequence. end_i (int, optional): The end index (interbase coordinates) of the subsequence to fetch. Defaults to ``None``. It is recommended to retrieve a subsequence by providing an index here, rather than by Python slicing the whole sequence. Returns: str: The requested sequence. Raises: RuntimeError: If the syntax doesn't match that of any of the databases. RuntimeError: If the request to the database fails. Examples: >>> len(fetch_seq('NP_056374.2')) 1596 >>> fetch_seq('NP_056374.2',0,10) # This! 'MESRETLSSS' >>> fetch_seq('NP_056374.2')[0:10] # Not this! 'MESRETLSSS' # Providing intervals is especially important for large sequences: >>> fetch_seq('NC_000001.10',2000000,2000030) 'ATCACACGTGCAGGAACCCTTTTCCAAAGG' # This call will pull back 30 bases plus overhead; without the # interval, one would receive 250MB of chr1 plus overhead! # Essentially any RefSeq, Genbank, BIC, or Ensembl sequence may be # fetched. >>> fetch_seq('NM_9.9') Traceback (most recent call last): ... RuntimeError: No sequence available for NM_9.9 >>> fetch_seq('QQ01234') Traceback (most recent call last): ... RuntimeError: No sequence fetcher for QQ01234 """ ac_dispatch = [ { "re": re.compile(r"^(?:AC|N[CGMPRTW])_|^[A-L]\w\d|^U\d"), "fetcher": _fetch_seq_ncbi, }, {"re": re.compile(r"^ENS[TP]\d+"), "fetcher": _fetch_seq_ensembl}, ] eligible_fetchers = [dr["fetcher"] for dr in ac_dispatch if dr["re"].match(ac)] if len(eligible_fetchers) == 0: raise RuntimeError("No sequence fetcher for {ac}".format(ac=ac)) if len(eligible_fetchers) >= 1: # pragma: nocover (no way to test) _logger.debug( "Multiple sequence fetchers found for " "{ac}; using first".format(ac=ac) ) fetcher = eligible_fetchers[0] _logger.debug("fetching {ac} with {f}".format(ac=ac, f=fetcher)) try: return fetcher(ac, start_i, end_i) except requests.RequestException as ex: raise RuntimeError("Failed to fetch {ac} ({ex})".format(ac=ac, ex=ex))
# ########################################################################### # Internal functions def _fetch_seq_ensembl(ac, start_i=None, end_i=None): """Fetch sequence slice from Ensembl public REST interface. Args: ac (str): The accession of the sequence to fetch. start_i (int, optional): The start index (interbase coordinates) of the subsequence to fetch. Defaults to None. end_i (int, optional): The end index (interbase coordinates) of the subsequence to fetch. Defaults to None. Returns: str: The requested (sub)sequence Raises: RequestException: if request is unsuccessful. KeyError: if Ensembl API returns a different version than requested Note: The Ensembl REST interface does not currently accept intervals, so this method slices the sequence locally. Examples: >> len(_fetch_seq_ensembl('ENSP00000288602')) 766 >> _fetch_seq_ensembl('ENSP00000288602',0,10) u'MAALSGGGGG' >> _fetch_seq_ensembl('ENSP00000288602')[0:10] u'MAALSGGGGG' >> ac = 'ENSP00000288602' >> _fetch_seq_ensembl(ac ,0, 10) == _fetch_seq_ensembl(ac)[0:10] True """ # Ensembl API only takes transcript IDs (without version) and returns the latest one # So we need to strip the transcript version, then check if what was returned was the one we requested version = None m = re.match(r"^(ENST\d+)\.(\d+)$", ac) if m: ac, version = m.groups() version = int(version) url = f"http://rest.ensembl.org/sequence/id/{ac}" r = requests.get(url, headers={"Content-Type": "application/json"}) r.raise_for_status() data = r.json() if version is not None: latest_version = data["version"] if version != latest_version: msg = f"Ensembl API only provides {ac} version ({latest_version}), requested: {version}" raise KeyError(msg) seq = data["seq"] return seq if (start_i is None or end_i is None) else seq[start_i:end_i] def _fetch_seq_ncbi(ac, start_i=None, end_i=None): """Fetches sequences from NCBI using the eutils interface. Args: ac (str): The accession of the sequence to fetch. start_i (int, optional): The start index (interbase coordinates) of the subsequence to fetch. Defaults to None. end_i (int, optional): The end index (interbase coordinates) of the subsequence to fetch. Defaults to None. Returns: str: The requested (sub)sequence Raises: RequestException: if request is unsuccessful. Notes: An interbase interval may be optionally provided with start_i and end_i. NCBI eutils will return just the requested subsequence, which might greatly reduce payload sizes (especially with chromosome-scale sequences). The request includes `tool` and `email` arguments to identify the caller as the bioutils package. According to https://www.ncbi.nlm.nih.gov/books/NBK25497/, these values should correspond to the library, not the library client. Using the defaults is recommended. Nonetheless, callers may set ``bioutils.seqfetcher.ncbi_tool`` and ``bioutils.seqfetcher.ncbi_email`` to custom values if that is desired. Examples: >> len(_fetch_seq_ncbi('NP_056374.2')) 1596 Pass the desired interval rather than using Python's [] slice operator. >> _fetch_seq_ncbi('NP_056374.2',0,10) 'MESRETLSSS' >> _fetch_seq_ncbi('NP_056374.2')[0:10] 'MESRETLSSS' >> _fetch_seq_ncbi('NP_056374.2',0,10) == _fetch_seq_ncbi('NP_056374.2')[0:10] True """ db = "protein" if ac[1] == "P" else "nucleotide" url_fmt = ( "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?" "db={db}&id={ac}&rettype=fasta" ) if start_i is None or end_i is None: url = url_fmt.format(db=db, ac=ac) else: url_fmt += "&seq_start={start}&seq_stop={stop}" url = url_fmt.format(db=db, ac=ac, start=start_i + 1, stop=end_i) url += "&tool={tool}&email={email}".format(tool=ncbi_tool, email=ncbi_email) url = _add_eutils_api_key(url) n_retries = 0 while True: resp = requests.get(url) if resp.ok: seq = "".join(resp.text.splitlines()[1:]) return seq elif resp.status_code == 400: # Invalid sequence or start/stop position for that sequence raise RuntimeError( "Fetching sequence {} with start index {} and end index {} failed, invalid sequence " "or start or end position".format(ac, start_i, end_i) ) if n_retries >= retry_limit: break if n_retries == 0: _logger.warning("Failed to fetch {}".format(url)) sleeptime = random.randint(n_retries, 3) ** n_retries _logger.warning( "Failure {}/{}; retry in {} seconds".format( n_retries, retry_limit, sleeptime ) ) time.sleep(sleeptime) n_retries += 1 # Falls through only on failure resp.raise_for_status() def _add_eutils_api_key(url): """Adds an eutils api key to the query. Args: url (str): The query url without the api key. Returns: str: The query url with the api key, if one is stored in the environment variable ``NCBI_API_KEY``, otherwise it is unaltered. """ apikey = os.environ.get("NCBI_API_KEY") if apikey: url += "&api_key={apikey}".format(apikey=apikey) return url # So that I don't forget why I didn't use ebi too: # $ curl 'http://www.ebi.ac.uk/ena/data/view/AM407889.1&display=fasta' # >ENA|AM407889|AM407889.2 Medicago sativa partial mRNA ... # AACGTATCACACTTCTTCTCCATTTCTTTTTCTTACATCTTCTCTCTACAAATTCATTTC # Note that we requested .1, got .2. Implicit behavior bites again. if __name__ == "__main__": # pragma: nocover import doctest doctest.testmod()